{
  "best_metric": 0.3474305272102356,
  "best_model_checkpoint": "./results\\checkpoint-6398",
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 6398,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 26.542728424072266, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 2.738, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 24.76641273498535, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 2.4407, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 30.07474708557129, |
|
"learning_rate": 3e-06, |
|
"loss": 2.4597, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 24.434463500976562, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 2.3404, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 23.83124542236328, |
|
"learning_rate": 5e-06, |
|
"loss": 2.1791, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 22.50973129272461, |
|
"learning_rate": 6e-06, |
|
"loss": 2.1673, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 27.750638961791992, |
|
"learning_rate": 7.000000000000001e-06, |
|
"loss": 2.2015, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 25.283756256103516, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 2.0006, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 25.05742835998535, |
|
"learning_rate": 9e-06, |
|
"loss": 1.7402, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 25.344383239746094, |
|
"learning_rate": 1e-05, |
|
"loss": 1.7645, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 28.333459854125977, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 2.1037, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 12.90853500366211, |
|
"learning_rate": 1.2e-05, |
|
"loss": 1.6745, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 25.75482940673828, |
|
"learning_rate": 1.3000000000000001e-05, |
|
"loss": 1.8785, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 19.2341251373291, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 1.6035, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 22.45606803894043, |
|
"learning_rate": 1.5e-05, |
|
"loss": 1.5633, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 14.334227561950684, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 1.2339, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 15.943977355957031, |
|
"learning_rate": 1.7000000000000003e-05, |
|
"loss": 1.4193, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 21.580835342407227, |
|
"learning_rate": 1.8e-05, |
|
"loss": 1.3797, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 18.85316276550293, |
|
"learning_rate": 1.9e-05, |
|
"loss": 1.2302, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 21.618574142456055, |
|
"learning_rate": 2e-05, |
|
"loss": 1.1087, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 33.01541519165039, |
|
"learning_rate": 2.1e-05, |
|
"loss": 1.9294, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 18.221654891967773, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 1.2177, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 25.145170211791992, |
|
"learning_rate": 2.3000000000000003e-05, |
|
"loss": 1.3802, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 18.087400436401367, |
|
"learning_rate": 2.4e-05, |
|
"loss": 1.3255, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 21.010631561279297, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.5414, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 10.053231239318848, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 0.9453, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 33.4912223815918, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 1.2211, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 20.72431182861328, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 0.9749, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 22.565200805664062, |
|
"learning_rate": 2.9e-05, |
|
"loss": 1.179, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 10.775777816772461, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2001, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 7.4110307693481445, |
|
"learning_rate": 3.1e-05, |
|
"loss": 0.9093, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 30.766653060913086, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 1.2831, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 20.368473052978516, |
|
"learning_rate": 3.3e-05, |
|
"loss": 1.1205, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 46.62571334838867, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 1.036, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 25.56611442565918, |
|
"learning_rate": 3.5e-05, |
|
"loss": 1.3756, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 18.900516510009766, |
|
"learning_rate": 3.6e-05, |
|
"loss": 1.2767, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 23.277311325073242, |
|
"learning_rate": 3.7e-05, |
|
"loss": 0.8536, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 35.67240524291992, |
|
"learning_rate": 3.8e-05, |
|
"loss": 1.6216, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 8.9258451461792, |
|
"learning_rate": 3.9000000000000006e-05, |
|
"loss": 1.0068, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 19.44056510925293, |
|
"learning_rate": 4e-05, |
|
"loss": 1.9435, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 19.775850296020508, |
|
"learning_rate": 4.1e-05, |
|
"loss": 1.2093, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 20.82231330871582, |
|
"learning_rate": 4.2e-05, |
|
"loss": 1.1288, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 7.916687488555908, |
|
"learning_rate": 4.3e-05, |
|
"loss": 0.9289, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 24.404117584228516, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 1.2781, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 17.555295944213867, |
|
"learning_rate": 4.5e-05, |
|
"loss": 1.1765, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 19.7391357421875, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 1.1167, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 19.916635513305664, |
|
"learning_rate": 4.7e-05, |
|
"loss": 1.0785, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 21.901708602905273, |
|
"learning_rate": 4.8e-05, |
|
"loss": 0.9351, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 21.167715072631836, |
|
"learning_rate": 4.9e-05, |
|
"loss": 1.3726, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 22.528865814208984, |
|
"learning_rate": 5e-05, |
|
"loss": 1.1176, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 23.06756019592285, |
|
"learning_rate": 4.991522550016955e-05, |
|
"loss": 1.2415, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 18.94842529296875, |
|
"learning_rate": 4.98304510003391e-05, |
|
"loss": 1.3143, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 23.966211318969727, |
|
"learning_rate": 4.9745676500508646e-05, |
|
"loss": 1.048, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 18.198823928833008, |
|
"learning_rate": 4.96609020006782e-05, |
|
"loss": 1.0476, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 11.191575050354004, |
|
"learning_rate": 4.9576127500847744e-05, |
|
"loss": 1.3924, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 42.68156814575195, |
|
"learning_rate": 4.94913530010173e-05, |
|
"loss": 1.3035, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 26.126558303833008, |
|
"learning_rate": 4.940657850118684e-05, |
|
"loss": 1.0578, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 8.13721752166748, |
|
"learning_rate": 4.9321804001356395e-05, |
|
"loss": 0.9093, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 38.31271743774414, |
|
"learning_rate": 4.923702950152595e-05, |
|
"loss": 0.8466, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 43.53594970703125, |
|
"learning_rate": 4.915225500169549e-05, |
|
"loss": 1.4861, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 32.57273864746094, |
|
"learning_rate": 4.9067480501865045e-05, |
|
"loss": 1.2987, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 32.26193618774414, |
|
"learning_rate": 4.898270600203459e-05, |
|
"loss": 1.3402, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 17.1823673248291, |
|
"learning_rate": 4.8897931502204143e-05, |
|
"loss": 1.0338, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 17.780929565429688, |
|
"learning_rate": 4.881315700237369e-05, |
|
"loss": 1.0368, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 18.093929290771484, |
|
"learning_rate": 4.8728382502543235e-05, |
|
"loss": 0.9465, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 23.80923080444336, |
|
"learning_rate": 4.864360800271279e-05, |
|
"loss": 0.9293, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 25.071569442749023, |
|
"learning_rate": 4.855883350288233e-05, |
|
"loss": 1.4204, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 17.217226028442383, |
|
"learning_rate": 4.8474059003051885e-05, |
|
"loss": 1.0646, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 16.813901901245117, |
|
"learning_rate": 4.838928450322143e-05, |
|
"loss": 0.9068, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 12.774974822998047, |
|
"learning_rate": 4.830451000339098e-05, |
|
"loss": 1.0567, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 9.550601959228516, |
|
"learning_rate": 4.821973550356053e-05, |
|
"loss": 0.9253, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 16.764188766479492, |
|
"learning_rate": 4.8134961003730075e-05, |
|
"loss": 1.1694, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 17.823640823364258, |
|
"learning_rate": 4.805018650389963e-05, |
|
"loss": 0.9413, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 5.949650287628174, |
|
"learning_rate": 4.796541200406918e-05, |
|
"loss": 1.2831, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 31.663116455078125, |
|
"learning_rate": 4.788063750423873e-05, |
|
"loss": 0.8206, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 19.267797470092773, |
|
"learning_rate": 4.779586300440828e-05, |
|
"loss": 0.8309, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 18.33287239074707, |
|
"learning_rate": 4.771108850457783e-05, |
|
"loss": 0.8532, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 14.53322982788086, |
|
"learning_rate": 4.7626314004747376e-05, |
|
"loss": 1.1592, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 19.1998291015625, |
|
"learning_rate": 4.754153950491692e-05, |
|
"loss": 0.9742, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 26.773860931396484, |
|
"learning_rate": 4.7456765005086474e-05, |
|
"loss": 1.2489, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 10.488304138183594, |
|
"learning_rate": 4.737199050525602e-05, |
|
"loss": 1.2353, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 26.352331161499023, |
|
"learning_rate": 4.728721600542557e-05, |
|
"loss": 1.1808, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 12.658868789672852, |
|
"learning_rate": 4.720244150559512e-05, |
|
"loss": 1.0206, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 23.39665985107422, |
|
"learning_rate": 4.711766700576467e-05, |
|
"loss": 0.9055, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 21.914995193481445, |
|
"learning_rate": 4.7032892505934216e-05, |
|
"loss": 1.1839, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 14.218494415283203, |
|
"learning_rate": 4.694811800610376e-05, |
|
"loss": 0.6352, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 31.233783721923828, |
|
"learning_rate": 4.6863343506273314e-05, |
|
"loss": 0.7302, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 24.867542266845703, |
|
"learning_rate": 4.677856900644286e-05, |
|
"loss": 1.1345, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 22.696847915649414, |
|
"learning_rate": 4.669379450661241e-05, |
|
"loss": 1.3002, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 17.476242065429688, |
|
"learning_rate": 4.6609020006781964e-05, |
|
"loss": 1.3256, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 21.66083526611328, |
|
"learning_rate": 4.652424550695152e-05, |
|
"loss": 0.914, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 20.41225242614746, |
|
"learning_rate": 4.643947100712106e-05, |
|
"loss": 0.7292, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 24.819978713989258, |
|
"learning_rate": 4.635469650729061e-05, |
|
"loss": 1.2431, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 33.943824768066406, |
|
"learning_rate": 4.626992200746016e-05, |
|
"loss": 1.6141, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 17.183805465698242, |
|
"learning_rate": 4.6185147507629706e-05, |
|
"loss": 0.7778, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 2.102858781814575, |
|
"learning_rate": 4.610037300779926e-05, |
|
"loss": 0.5896, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 23.15467071533203, |
|
"learning_rate": 4.6015598507968804e-05, |
|
"loss": 1.4732, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 23.75440216064453, |
|
"learning_rate": 4.593082400813836e-05, |
|
"loss": 1.2125, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 31.520126342773438, |
|
"learning_rate": 4.58460495083079e-05, |
|
"loss": 0.7964, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 14.303851127624512, |
|
"learning_rate": 4.576127500847745e-05, |
|
"loss": 0.8121, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 12.356579780578613, |
|
"learning_rate": 4.5676500508647e-05, |
|
"loss": 0.8765, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 36.80412292480469, |
|
"learning_rate": 4.5591726008816546e-05, |
|
"loss": 1.6265, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 32.955177307128906, |
|
"learning_rate": 4.55069515089861e-05, |
|
"loss": 0.7252, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 18.75346565246582, |
|
"learning_rate": 4.5422177009155644e-05, |
|
"loss": 0.7021, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 10.651359558105469, |
|
"learning_rate": 4.5337402509325197e-05, |
|
"loss": 1.0639, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 15.532186508178711, |
|
"learning_rate": 4.525262800949475e-05, |
|
"loss": 0.804, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 60.16484451293945, |
|
"learning_rate": 4.5167853509664295e-05, |
|
"loss": 0.7853, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 19.321165084838867, |
|
"learning_rate": 4.508307900983385e-05, |
|
"loss": 0.8331, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.306361436843872, |
|
"learning_rate": 4.499830451000339e-05, |
|
"loss": 0.9297, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 18.4438419342041, |
|
"learning_rate": 4.4913530010172945e-05, |
|
"loss": 1.4051, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 3.134817123413086, |
|
"learning_rate": 4.482875551034249e-05, |
|
"loss": 0.5885, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 34.41851043701172, |
|
"learning_rate": 4.474398101051204e-05, |
|
"loss": 1.1314, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 6.790000915527344, |
|
"learning_rate": 4.465920651068159e-05, |
|
"loss": 1.2724, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 11.602365493774414, |
|
"learning_rate": 4.4574432010851135e-05, |
|
"loss": 0.7734, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 23.654390335083008, |
|
"learning_rate": 4.448965751102069e-05, |
|
"loss": 0.9145, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 37.119022369384766, |
|
"learning_rate": 4.440488301119023e-05, |
|
"loss": 1.0029, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 22.183420181274414, |
|
"learning_rate": 4.4320108511359785e-05, |
|
"loss": 0.9291, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 25.14228630065918, |
|
"learning_rate": 4.423533401152933e-05, |
|
"loss": 1.0194, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 15.864004135131836, |
|
"learning_rate": 4.415055951169888e-05, |
|
"loss": 1.0367, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 3.8865504264831543, |
|
"learning_rate": 4.4065785011868436e-05, |
|
"loss": 0.5909, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 20.187772750854492, |
|
"learning_rate": 4.398101051203798e-05, |
|
"loss": 1.23, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.4391849040985107, |
|
"learning_rate": 4.3896236012207534e-05, |
|
"loss": 0.927, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 27.003379821777344, |
|
"learning_rate": 4.381146151237708e-05, |
|
"loss": 1.0672, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 9.609984397888184, |
|
"learning_rate": 4.372668701254663e-05, |
|
"loss": 0.7269, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 25.993255615234375, |
|
"learning_rate": 4.364191251271618e-05, |
|
"loss": 0.8122, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 10.979281425476074, |
|
"learning_rate": 4.355713801288573e-05, |
|
"loss": 0.8113, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.965178966522217, |
|
"learning_rate": 4.3472363513055276e-05, |
|
"loss": 0.656, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 27.325172424316406, |
|
"learning_rate": 4.338758901322482e-05, |
|
"loss": 1.011, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.6009902954101562, |
|
"learning_rate": 4.3302814513394374e-05, |
|
"loss": 0.9724, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 56.60973358154297, |
|
"learning_rate": 4.321804001356392e-05, |
|
"loss": 1.0712, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 11.2178955078125, |
|
"learning_rate": 4.313326551373347e-05, |
|
"loss": 1.188, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 43.168479919433594, |
|
"learning_rate": 4.304849101390302e-05, |
|
"loss": 0.9892, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 22.50556755065918, |
|
"learning_rate": 4.296371651407257e-05, |
|
"loss": 0.8076, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 3.59372615814209, |
|
"learning_rate": 4.2878942014242116e-05, |
|
"loss": 0.5126, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 14.512517929077148, |
|
"learning_rate": 4.279416751441167e-05, |
|
"loss": 0.7622, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 61.02300262451172, |
|
"learning_rate": 4.270939301458122e-05, |
|
"loss": 1.2306, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 17.68853187561035, |
|
"learning_rate": 4.2624618514750766e-05, |
|
"loss": 1.1169, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 17.228788375854492, |
|
"learning_rate": 4.253984401492032e-05, |
|
"loss": 0.7908, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.8683578968048096, |
|
"learning_rate": 4.2455069515089864e-05, |
|
"loss": 0.595, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 33.53107833862305, |
|
"learning_rate": 4.237029501525942e-05, |
|
"loss": 0.7539, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 17.268796920776367, |
|
"learning_rate": 4.228552051542896e-05, |
|
"loss": 0.9071, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 4.079007625579834, |
|
"learning_rate": 4.220074601559851e-05, |
|
"loss": 1.3714, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 14.3380765914917, |
|
"learning_rate": 4.211597151576806e-05, |
|
"loss": 0.8167, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 21.197294235229492, |
|
"learning_rate": 4.2031197015937606e-05, |
|
"loss": 0.8644, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 12.999835968017578, |
|
"learning_rate": 4.194642251610716e-05, |
|
"loss": 1.1938, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 37.84367752075195, |
|
"learning_rate": 4.1861648016276704e-05, |
|
"loss": 0.8975, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 24.756757736206055, |
|
"learning_rate": 4.1776873516446257e-05, |
|
"loss": 0.7786, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 35.1243896484375, |
|
"learning_rate": 4.16920990166158e-05, |
|
"loss": 0.8236, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 7.9437575340271, |
|
"learning_rate": 4.160732451678535e-05, |
|
"loss": 0.696, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 36.57485580444336, |
|
"learning_rate": 4.15225500169549e-05, |
|
"loss": 1.2849, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 17.662107467651367, |
|
"learning_rate": 4.143777551712445e-05, |
|
"loss": 1.0252, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 32.92692565917969, |
|
"learning_rate": 4.1353001017294005e-05, |
|
"loss": 1.1483, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 23.67911720275879, |
|
"learning_rate": 4.126822651746355e-05, |
|
"loss": 0.8862, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 10.78515625, |
|
"learning_rate": 4.11834520176331e-05, |
|
"loss": 0.7059, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 10.892401695251465, |
|
"learning_rate": 4.109867751780265e-05, |
|
"loss": 1.0229, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 31.348299026489258, |
|
"learning_rate": 4.1013903017972195e-05, |
|
"loss": 0.8373, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 18.273740768432617, |
|
"learning_rate": 4.092912851814175e-05, |
|
"loss": 1.6494, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.3065638542175293, |
|
"learning_rate": 4.084435401831129e-05, |
|
"loss": 1.4762, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 20.082490921020508, |
|
"learning_rate": 4.0759579518480845e-05, |
|
"loss": 0.6694, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 22.293842315673828, |
|
"learning_rate": 4.067480501865039e-05, |
|
"loss": 1.5581, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 10.693699836730957, |
|
"learning_rate": 4.059003051881994e-05, |
|
"loss": 1.1067, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 32.38418960571289, |
|
"learning_rate": 4.050525601898949e-05, |
|
"loss": 0.6269, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 11.499810218811035, |
|
"learning_rate": 4.0420481519159035e-05, |
|
"loss": 0.4731, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 27.077342987060547, |
|
"learning_rate": 4.033570701932859e-05, |
|
"loss": 1.1482, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 32.90080261230469, |
|
"learning_rate": 4.025093251949813e-05, |
|
"loss": 0.7159, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.5575653910636902, |
|
"learning_rate": 4.0166158019667685e-05, |
|
"loss": 1.1091, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 2.83486270904541, |
|
"learning_rate": 4.008138351983724e-05, |
|
"loss": 0.6591, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 6.872043609619141, |
|
"learning_rate": 3.999660902000678e-05, |
|
"loss": 1.4829, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 46.48579406738281, |
|
"learning_rate": 3.9911834520176336e-05, |
|
"loss": 0.9756, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.8250547647476196, |
|
"learning_rate": 3.982706002034588e-05, |
|
"loss": 1.029, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 14.628582000732422, |
|
"learning_rate": 3.9742285520515434e-05, |
|
"loss": 0.8788, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 16.0622615814209, |
|
"learning_rate": 3.965751102068498e-05, |
|
"loss": 0.9625, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.5123465061187744, |
|
"learning_rate": 3.957273652085453e-05, |
|
"loss": 0.3242, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 24.70962142944336, |
|
"learning_rate": 3.948796202102408e-05, |
|
"loss": 1.0982, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.3634880483150482, |
|
"learning_rate": 3.940318752119362e-05, |
|
"loss": 0.2891, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.15528139472007751, |
|
"learning_rate": 3.9318413021363176e-05, |
|
"loss": 0.964, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 49.00790023803711, |
|
"learning_rate": 3.923363852153272e-05, |
|
"loss": 1.1447, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 17.20952796936035, |
|
"learning_rate": 3.9148864021702274e-05, |
|
"loss": 0.2853, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 41.41338348388672, |
|
"learning_rate": 3.906408952187182e-05, |
|
"loss": 0.6706, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 18.779157638549805, |
|
"learning_rate": 3.897931502204137e-05, |
|
"loss": 0.529, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 14.820104598999023, |
|
"learning_rate": 3.889454052221092e-05, |
|
"loss": 0.8685, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 23.525177001953125, |
|
"learning_rate": 3.880976602238047e-05, |
|
"loss": 0.6292, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.442511647939682, |
|
"learning_rate": 3.872499152255002e-05, |
|
"loss": 0.9998, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 4.724640846252441, |
|
"learning_rate": 3.864021702271957e-05, |
|
"loss": 1.1116, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 26.15089225769043, |
|
"learning_rate": 3.855544252288912e-05, |
|
"loss": 1.109, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 25.109575271606445, |
|
"learning_rate": 3.8470668023058666e-05, |
|
"loss": 0.9564, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 6.969759464263916, |
|
"learning_rate": 3.838589352322822e-05, |
|
"loss": 0.9998, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.33139169216156, |
|
"learning_rate": 3.8301119023397764e-05, |
|
"loss": 1.0358, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 18.717729568481445, |
|
"learning_rate": 3.821634452356731e-05, |
|
"loss": 0.8999, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.792151927947998, |
|
"learning_rate": 3.813157002373686e-05, |
|
"loss": 0.3362, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 31.383317947387695, |
|
"learning_rate": 3.804679552390641e-05, |
|
"loss": 1.3567, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 17.265098571777344, |
|
"learning_rate": 3.796202102407596e-05, |
|
"loss": 1.3984, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 28.290605545043945, |
|
"learning_rate": 3.7877246524245506e-05, |
|
"loss": 0.9521, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 25.10551643371582, |
|
"learning_rate": 3.779247202441506e-05, |
|
"loss": 0.8199, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 15.570342063903809, |
|
"learning_rate": 3.7707697524584604e-05, |
|
"loss": 0.6976, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 48.913543701171875, |
|
"learning_rate": 3.762292302475415e-05, |
|
"loss": 1.1964, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 38.607730865478516, |
|
"learning_rate": 3.75381485249237e-05, |
|
"loss": 1.2215, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 61.75456619262695, |
|
"learning_rate": 3.7453374025093255e-05, |
|
"loss": 1.1017, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 3.9896836280822754, |
|
"learning_rate": 3.736859952526281e-05, |
|
"loss": 0.6199, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 22.329191207885742, |
|
"learning_rate": 3.728382502543235e-05, |
|
"loss": 0.6508, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 44.48137283325195, |
|
"learning_rate": 3.7199050525601905e-05, |
|
"loss": 0.3921, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 29.464078903198242, |
|
"learning_rate": 3.711427602577145e-05, |
|
"loss": 0.702, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.4076429605484009, |
|
"learning_rate": 3.7029501525940996e-05, |
|
"loss": 0.6526, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 8.90776538848877, |
|
"learning_rate": 3.694472702611055e-05, |
|
"loss": 0.5163, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.1676784753799438, |
|
"learning_rate": 3.6859952526280095e-05, |
|
"loss": 0.3613, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 17.235055923461914, |
|
"learning_rate": 3.677517802644965e-05, |
|
"loss": 0.6757, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 76.95849609375, |
|
"learning_rate": 3.669040352661919e-05, |
|
"loss": 0.6235, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 60.8438720703125, |
|
"learning_rate": 3.6605629026788745e-05, |
|
"loss": 1.293, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 52.46332550048828, |
|
"learning_rate": 3.652085452695829e-05, |
|
"loss": 1.0197, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.09444449096918106, |
|
"learning_rate": 3.6436080027127836e-05, |
|
"loss": 0.7193, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.06203605979681015, |
|
"learning_rate": 3.635130552729739e-05, |
|
"loss": 0.9315, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 18.52464485168457, |
|
"learning_rate": 3.6266531027466935e-05, |
|
"loss": 0.8383, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 21.279346466064453, |
|
"learning_rate": 3.618175652763649e-05, |
|
"loss": 0.4203, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 33.78806686401367, |
|
"learning_rate": 3.609698202780604e-05, |
|
"loss": 0.5023, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 4.57900857925415, |
|
"learning_rate": 3.601220752797559e-05, |
|
"loss": 0.8179, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 8.65097427368164, |
|
"learning_rate": 3.592743302814514e-05, |
|
"loss": 1.3782, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.10031997412443161, |
|
"learning_rate": 3.584265852831468e-05, |
|
"loss": 0.9372, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 26.388620376586914, |
|
"learning_rate": 3.5757884028484236e-05, |
|
"loss": 1.1846, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 12.027292251586914, |
|
"learning_rate": 3.567310952865378e-05, |
|
"loss": 0.4438, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 6.0509867668151855, |
|
"learning_rate": 3.5588335028823334e-05, |
|
"loss": 0.7817, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 18.15105438232422, |
|
"learning_rate": 3.550356052899288e-05, |
|
"loss": 0.9115, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.0213444232940674, |
|
"learning_rate": 3.541878602916243e-05, |
|
"loss": 0.4687, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 71.69523620605469, |
|
"learning_rate": 3.533401152933198e-05, |
|
"loss": 0.7518, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 86.0584945678711, |
|
"learning_rate": 3.524923702950152e-05, |
|
"loss": 1.1291, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 2.7845587730407715, |
|
"learning_rate": 3.5164462529671076e-05, |
|
"loss": 1.0255, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 10.846942901611328, |
|
"learning_rate": 3.507968802984062e-05, |
|
"loss": 0.6766, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 50.255889892578125, |
|
"learning_rate": 3.4994913530010174e-05, |
|
"loss": 0.4563, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 20.510534286499023, |
|
"learning_rate": 3.491013903017972e-05, |
|
"loss": 1.1467, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 55.81074905395508, |
|
"learning_rate": 3.482536453034927e-05, |
|
"loss": 0.6068, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 29.32077407836914, |
|
"learning_rate": 3.4740590030518824e-05, |
|
"loss": 1.0515, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 21.01796531677246, |
|
"learning_rate": 3.465581553068837e-05, |
|
"loss": 0.8702, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.176788330078125, |
|
"learning_rate": 3.457104103085792e-05, |
|
"loss": 0.7961, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 33.97718048095703, |
|
"learning_rate": 3.448626653102747e-05, |
|
"loss": 0.6178, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.07225427776575089, |
|
"learning_rate": 3.440149203119702e-05, |
|
"loss": 0.7895, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.7126123905181885, |
|
"learning_rate": 3.4316717531366566e-05, |
|
"loss": 0.8879, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 60.54833221435547, |
|
"learning_rate": 3.423194303153612e-05, |
|
"loss": 0.8868, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 14.958369255065918, |
|
"learning_rate": 3.4147168531705664e-05, |
|
"loss": 0.9596, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 3.474311351776123, |
|
"learning_rate": 3.406239403187521e-05, |
|
"loss": 0.8218, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 5.398329734802246, |
|
"learning_rate": 3.397761953204476e-05, |
|
"loss": 0.7026, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 71.55179595947266, |
|
"learning_rate": 3.389284503221431e-05, |
|
"loss": 0.8114, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 5.973948001861572, |
|
"learning_rate": 3.380807053238386e-05, |
|
"loss": 0.4706, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 3.1762962341308594, |
|
"learning_rate": 3.3723296032553406e-05, |
|
"loss": 0.5073, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 38.53884506225586, |
|
"learning_rate": 3.363852153272296e-05, |
|
"loss": 0.6444, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 51.29215621948242, |
|
"learning_rate": 3.355374703289251e-05, |
|
"loss": 0.5418, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.8000295758247375, |
|
"learning_rate": 3.3468972533062056e-05, |
|
"loss": 0.558, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 33.07465744018555, |
|
"learning_rate": 3.338419803323161e-05, |
|
"loss": 1.4676, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 9.931634902954102, |
|
"learning_rate": 3.3299423533401155e-05, |
|
"loss": 1.3777, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 18.494661331176758, |
|
"learning_rate": 3.321464903357071e-05, |
|
"loss": 0.6258, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.16553539037704468, |
|
"learning_rate": 3.312987453374025e-05, |
|
"loss": 0.964, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 55.56105041503906, |
|
"learning_rate": 3.3045100033909805e-05, |
|
"loss": 0.4535, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 70.11732482910156, |
|
"learning_rate": 3.296032553407935e-05, |
|
"loss": 0.7698, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 58.0792121887207, |
|
"learning_rate": 3.2875551034248896e-05, |
|
"loss": 0.98, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 15.220126152038574, |
|
"learning_rate": 3.279077653441845e-05, |
|
"loss": 0.9123, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 13.152356147766113, |
|
"learning_rate": 3.2706002034587994e-05, |
|
"loss": 0.7913, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 37.108360290527344, |
|
"learning_rate": 3.262122753475755e-05, |
|
"loss": 0.6322, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 133.0155029296875, |
|
"learning_rate": 3.253645303492709e-05, |
|
"loss": 0.8002, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 5.423397541046143, |
|
"learning_rate": 3.2451678535096645e-05, |
|
"loss": 0.6649, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 37.421443939208984, |
|
"learning_rate": 3.236690403526619e-05, |
|
"loss": 0.4575, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 20.839557647705078, |
|
"learning_rate": 3.228212953543574e-05, |
|
"loss": 0.9595, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 8.633934020996094, |
|
"learning_rate": 3.2197355035605296e-05, |
|
"loss": 0.8809, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 27.769481658935547, |
|
"learning_rate": 3.211258053577484e-05, |
|
"loss": 0.9243, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.13876375555992126, |
|
"learning_rate": 3.2027806035944394e-05, |
|
"loss": 0.5838, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 50.39765548706055, |
|
"learning_rate": 3.194303153611394e-05, |
|
"loss": 0.4513, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 20.293073654174805, |
|
"learning_rate": 3.185825703628349e-05, |
|
"loss": 0.2965, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.49339720606803894, |
|
"learning_rate": 3.177348253645304e-05, |
|
"loss": 0.3329, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 8.260361671447754, |
|
"learning_rate": 3.168870803662258e-05, |
|
"loss": 0.8128, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 3.9407100677490234, |
|
"learning_rate": 3.1603933536792135e-05, |
|
"loss": 0.1281, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 51.073455810546875, |
|
"learning_rate": 3.151915903696168e-05, |
|
"loss": 0.7543, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.05772038921713829, |
|
"learning_rate": 3.1434384537131234e-05, |
|
"loss": 0.8038, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 5.772864818572998, |
|
"learning_rate": 3.134961003730078e-05, |
|
"loss": 1.0692, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 29.10246467590332, |
|
"learning_rate": 3.126483553747033e-05, |
|
"loss": 1.0786, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 74.27193450927734, |
|
"learning_rate": 3.118006103763988e-05, |
|
"loss": 0.3927, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 5.654672622680664, |
|
"learning_rate": 3.109528653780942e-05, |
|
"loss": 0.0392, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.030826814472675323, |
|
"learning_rate": 3.1010512037978975e-05, |
|
"loss": 0.5397, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.03806144371628761, |
|
"learning_rate": 3.092573753814853e-05, |
|
"loss": 0.5454, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 2.1753628253936768, |
|
"learning_rate": 3.084096303831808e-05, |
|
"loss": 0.7065, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 98.3718490600586, |
|
"learning_rate": 3.0756188538487626e-05, |
|
"loss": 0.5413, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 47.713836669921875, |
|
"learning_rate": 3.067141403865718e-05, |
|
"loss": 0.6828, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 5.499966144561768, |
|
"learning_rate": 3.0586639538826724e-05, |
|
"loss": 1.1348, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.6247652769088745, |
|
"learning_rate": 3.050186503899627e-05, |
|
"loss": 0.6771, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 3.8232038021087646, |
|
"learning_rate": 3.0417090539165822e-05, |
|
"loss": 0.1384, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 16.047853469848633, |
|
"learning_rate": 3.0332316039335368e-05, |
|
"loss": 0.3357, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 22.86617660522461, |
|
"learning_rate": 3.024754153950492e-05, |
|
"loss": 0.1991, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 81.28910064697266, |
|
"learning_rate": 3.0162767039674466e-05, |
|
"loss": 0.4485, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.2680492699146271, |
|
"learning_rate": 3.007799253984402e-05, |
|
"loss": 0.83, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.05300199240446091, |
|
"learning_rate": 2.9993218040013564e-05, |
|
"loss": 0.5507, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 73.55166625976562, |
|
"learning_rate": 2.9908443540183113e-05, |
|
"loss": 1.6387, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 50.422142028808594, |
|
"learning_rate": 2.9823669040352665e-05, |
|
"loss": 0.5293, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 25.55157470703125, |
|
"learning_rate": 2.973889454052221e-05, |
|
"loss": 1.0067, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.4120949506759644, |
|
"learning_rate": 2.9654120040691764e-05, |
|
"loss": 1.3009, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 15.355062484741211, |
|
"learning_rate": 2.956934554086131e-05, |
|
"loss": 0.843, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.4958977997303009, |
|
"learning_rate": 2.948457104103086e-05, |
|
"loss": 0.6706, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.02339070662856102, |
|
"learning_rate": 2.9399796541200407e-05, |
|
"loss": 0.5322, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 41.17433547973633, |
|
"learning_rate": 2.9315022041369956e-05, |
|
"loss": 0.5084, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.185633897781372, |
|
"learning_rate": 2.923024754153951e-05, |
|
"loss": 0.4755, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 53.46243667602539, |
|
"learning_rate": 2.9145473041709054e-05, |
|
"loss": 0.7402, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 2.203493118286133, |
|
"learning_rate": 2.9060698541878607e-05, |
|
"loss": 0.6368, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 8.445558547973633, |
|
"learning_rate": 2.8975924042048153e-05, |
|
"loss": 0.3933, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 44.631771087646484, |
|
"learning_rate": 2.8891149542217705e-05, |
|
"loss": 1.0651, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 26.084287643432617, |
|
"learning_rate": 2.880637504238725e-05, |
|
"loss": 1.0695, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.2506965398788452, |
|
"learning_rate": 2.8721600542556796e-05, |
|
"loss": 0.2011, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 25.166454315185547, |
|
"learning_rate": 2.863682604272635e-05, |
|
"loss": 0.756, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 26.68798065185547, |
|
"learning_rate": 2.8552051542895898e-05, |
|
"loss": 1.0222, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 61.67727279663086, |
|
"learning_rate": 2.846727704306545e-05, |
|
"loss": 0.6143, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.026347124949097633, |
|
"learning_rate": 2.8382502543234996e-05, |
|
"loss": 0.6298, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 70.15167999267578, |
|
"learning_rate": 2.829772804340455e-05, |
|
"loss": 1.0136, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 75.8296127319336, |
|
"learning_rate": 2.8212953543574094e-05, |
|
"loss": 0.8159, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.20398695766925812, |
|
"learning_rate": 2.812817904374364e-05, |
|
"loss": 0.8516, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 95.35974884033203, |
|
"learning_rate": 2.8043404543913192e-05, |
|
"loss": 1.397, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.2767679691314697, |
|
"learning_rate": 2.795863004408274e-05, |
|
"loss": 0.1996, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 28.711078643798828, |
|
"learning_rate": 2.7873855544252294e-05, |
|
"loss": 0.6535, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 41.65683364868164, |
|
"learning_rate": 2.778908104442184e-05, |
|
"loss": 0.5639, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.6953363418579102, |
|
"learning_rate": 2.770430654459139e-05, |
|
"loss": 0.4564, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.09095999598503113, |
|
"learning_rate": 2.7619532044760937e-05, |
|
"loss": 0.4993, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 32.08039093017578, |
|
"learning_rate": 2.7534757544930483e-05, |
|
"loss": 0.9855, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.22207607328891754, |
|
"learning_rate": 2.7449983045100035e-05, |
|
"loss": 0.5325, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.050914715975522995, |
|
"learning_rate": 2.736520854526958e-05, |
|
"loss": 0.5084, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.8935670852661133, |
|
"learning_rate": 2.7280434045439134e-05, |
|
"loss": 0.6757, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 19.006797790527344, |
|
"learning_rate": 2.7195659545608683e-05, |
|
"loss": 0.6753, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 15.514384269714355, |
|
"learning_rate": 2.7110885045778235e-05, |
|
"loss": 0.4333, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 9.05042552947998, |
|
"learning_rate": 2.702611054594778e-05, |
|
"loss": 0.681, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 59.737754821777344, |
|
"learning_rate": 2.6941336046117326e-05, |
|
"loss": 1.1243, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 55.89622116088867, |
|
"learning_rate": 2.685656154628688e-05, |
|
"loss": 0.4998, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.33736708760261536, |
|
"learning_rate": 2.6771787046456424e-05, |
|
"loss": 0.5959, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 49.71003341674805, |
|
"learning_rate": 2.6687012546625977e-05, |
|
"loss": 0.869, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.18244197964668274, |
|
"learning_rate": 2.6602238046795526e-05, |
|
"loss": 0.5476, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 57.65730285644531, |
|
"learning_rate": 2.651746354696508e-05, |
|
"loss": 0.8371, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 8.13063907623291, |
|
"learning_rate": 2.6432689047134624e-05, |
|
"loss": 0.3608, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.2764989733695984, |
|
"learning_rate": 2.634791454730417e-05, |
|
"loss": 0.326, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.09723484516143799, |
|
"learning_rate": 2.6263140047473722e-05, |
|
"loss": 0.4903, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 90.19970703125, |
|
"learning_rate": 2.6178365547643268e-05, |
|
"loss": 0.5478, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.756451964378357, |
|
"learning_rate": 2.609359104781282e-05, |
|
"loss": 0.3005, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 95.1401596069336, |
|
"learning_rate": 2.600881654798237e-05, |
|
"loss": 0.3226, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.05358866602182388, |
|
"learning_rate": 2.5924042048151918e-05, |
|
"loss": 0.142, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 47.20892333984375, |
|
"learning_rate": 2.5839267548321467e-05, |
|
"loss": 0.8748, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.11273138970136642, |
|
"learning_rate": 2.5754493048491013e-05, |
|
"loss": 0.7466, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 11.685613632202148, |
|
"learning_rate": 2.5669718548660565e-05, |
|
"loss": 0.309, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.3904511332511902, |
|
"learning_rate": 2.558494404883011e-05, |
|
"loss": 0.4282, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.14008870720863342, |
|
"learning_rate": 2.5500169548999664e-05, |
|
"loss": 0.34, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 74.25190734863281, |
|
"learning_rate": 2.541539504916921e-05, |
|
"loss": 0.7807, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 50.943485260009766, |
|
"learning_rate": 2.533062054933876e-05, |
|
"loss": 1.5617, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.12540976703166962, |
|
"learning_rate": 2.524584604950831e-05, |
|
"loss": 0.5259, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 23.478382110595703, |
|
"learning_rate": 2.5161071549677856e-05, |
|
"loss": 0.6975, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.06014597788453102, |
|
"learning_rate": 2.507629704984741e-05, |
|
"loss": 0.4681, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.8615859150886536, |
|
"learning_rate": 2.4991522550016954e-05, |
|
"loss": 0.0536, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.489899218082428, |
|
"learning_rate": 2.4906748050186503e-05, |
|
"loss": 0.9952, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 4.645602226257324, |
|
"learning_rate": 2.4821973550356053e-05, |
|
"loss": 0.4834, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.08961528539657593, |
|
"learning_rate": 2.47371990505256e-05, |
|
"loss": 0.4156, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 39.562774658203125, |
|
"learning_rate": 2.4652424550695154e-05, |
|
"loss": 0.7731, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 41.32746124267578, |
|
"learning_rate": 2.4567650050864703e-05, |
|
"loss": 0.4151, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 19.19269561767578, |
|
"learning_rate": 2.4482875551034252e-05, |
|
"loss": 1.0375, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 7.726545333862305, |
|
"learning_rate": 2.4398101051203798e-05, |
|
"loss": 0.3077, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 12.059285163879395, |
|
"learning_rate": 2.4313326551373347e-05, |
|
"loss": 0.935, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 19.483482360839844, |
|
"learning_rate": 2.4228552051542896e-05, |
|
"loss": 0.2129, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.06684020906686783, |
|
"learning_rate": 2.4143777551712445e-05, |
|
"loss": 0.3169, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 44.86419677734375, |
|
"learning_rate": 2.4059003051881994e-05, |
|
"loss": 1.2605, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 6.09264612197876, |
|
"learning_rate": 2.3974228552051546e-05, |
|
"loss": 0.7057, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.04498284310102463, |
|
"learning_rate": 2.3889454052221095e-05, |
|
"loss": 0.9476, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 25.088247299194336, |
|
"learning_rate": 2.380467955239064e-05, |
|
"loss": 0.9267, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 45.36594772338867, |
|
"learning_rate": 2.371990505256019e-05, |
|
"loss": 0.7207, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 3.7207283973693848, |
|
"learning_rate": 2.363513055272974e-05, |
|
"loss": 0.7053, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 42.47288131713867, |
|
"learning_rate": 2.3550356052899288e-05, |
|
"loss": 0.2621, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 89.94295501708984, |
|
"learning_rate": 2.3465581553068837e-05, |
|
"loss": 0.328, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.10006561875343323, |
|
"learning_rate": 2.3380807053238386e-05, |
|
"loss": 0.8296, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 89.21922302246094, |
|
"learning_rate": 2.329603255340794e-05, |
|
"loss": 0.9074, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.014263429678976536, |
|
"learning_rate": 2.3211258053577484e-05, |
|
"loss": 0.5238, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.2213358879089355, |
|
"learning_rate": 2.3126483553747033e-05, |
|
"loss": 0.4163, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.10971381515264511, |
|
"learning_rate": 2.3041709053916582e-05, |
|
"loss": 0.4396, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.9370849132537842, |
|
"learning_rate": 2.295693455408613e-05, |
|
"loss": 0.9562, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.26476842164993286, |
|
"learning_rate": 2.287216005425568e-05, |
|
"loss": 0.3369, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 5.879600524902344, |
|
"learning_rate": 2.278738555442523e-05, |
|
"loss": 0.4717, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 91.1629867553711, |
|
"learning_rate": 2.270261105459478e-05, |
|
"loss": 0.4292, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.3384571075439453, |
|
"learning_rate": 2.2617836554764328e-05, |
|
"loss": 0.4184, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.09564901143312454, |
|
"learning_rate": 2.2533062054933877e-05, |
|
"loss": 0.3908, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 2.1722471714019775, |
|
"learning_rate": 2.2448287555103426e-05, |
|
"loss": 0.5939, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 86.56784057617188, |
|
"learning_rate": 2.2363513055272975e-05, |
|
"loss": 0.6253, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.4434572756290436, |
|
"learning_rate": 2.2278738555442524e-05, |
|
"loss": 0.2656, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.3613429069519043, |
|
"learning_rate": 2.2193964055612073e-05, |
|
"loss": 0.5376, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 39.03946304321289, |
|
"learning_rate": 2.2109189555781622e-05, |
|
"loss": 0.5256, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 33.69869613647461, |
|
"learning_rate": 2.202441505595117e-05, |
|
"loss": 1.9483, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.014940350316464901, |
|
"learning_rate": 2.193964055612072e-05, |
|
"loss": 0.2437, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.5131822824478149, |
|
"learning_rate": 2.185486605629027e-05, |
|
"loss": 0.6613, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.8719220161437988, |
|
"learning_rate": 2.1770091556459818e-05, |
|
"loss": 1.1681, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.0348847322165966, |
|
"learning_rate": 2.1685317056629367e-05, |
|
"loss": 0.4557, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.1149950921535492, |
|
"learning_rate": 2.1600542556798916e-05, |
|
"loss": 0.2578, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 4.825486660003662, |
|
"learning_rate": 2.1515768056968465e-05, |
|
"loss": 0.5987, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 24.669727325439453, |
|
"learning_rate": 2.1430993557138014e-05, |
|
"loss": 0.4292, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.8084043264389038, |
|
"learning_rate": 2.1346219057307563e-05, |
|
"loss": 0.4102, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 73.08427429199219, |
|
"learning_rate": 2.1261444557477112e-05, |
|
"loss": 0.9116, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 30.825637817382812, |
|
"learning_rate": 2.117667005764666e-05, |
|
"loss": 0.5745, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 102.85230255126953, |
|
"learning_rate": 2.109189555781621e-05, |
|
"loss": 0.2618, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.03866774961352348, |
|
"learning_rate": 2.100712105798576e-05, |
|
"loss": 0.8069, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 3.1322760581970215, |
|
"learning_rate": 2.092234655815531e-05, |
|
"loss": 0.3481, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.014790826477110386, |
|
"learning_rate": 2.0837572058324854e-05, |
|
"loss": 0.4006, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.11268659681081772, |
|
"learning_rate": 2.0752797558494407e-05, |
|
"loss": 0.4533, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.0567173957824707, |
|
"learning_rate": 2.0668023058663956e-05, |
|
"loss": 0.4536, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 8.341649055480957, |
|
"learning_rate": 2.0583248558833505e-05, |
|
"loss": 0.6103, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.268702745437622, |
|
"learning_rate": 2.0498474059003054e-05, |
|
"loss": 0.3091, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 8.767001152038574, |
|
"learning_rate": 2.0413699559172603e-05, |
|
"loss": 0.1458, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 109.38581848144531, |
|
"learning_rate": 2.0328925059342152e-05, |
|
"loss": 0.6205, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.01923774927854538, |
|
"learning_rate": 2.0244150559511698e-05, |
|
"loss": 0.5022, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 6.962080955505371, |
|
"learning_rate": 2.0159376059681247e-05, |
|
"loss": 0.0514, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.028890101239085197, |
|
"learning_rate": 2.00746015598508e-05, |
|
"loss": 1.4382, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.056723129004240036, |
|
"learning_rate": 1.9989827060020348e-05, |
|
"loss": 0.406, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.6163440942764282, |
|
"learning_rate": 1.9905052560189897e-05, |
|
"loss": 0.0346, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.029935460537672043, |
|
"learning_rate": 1.9820278060359446e-05, |
|
"loss": 0.3818, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.2080775797367096, |
|
"learning_rate": 1.9735503560528995e-05, |
|
"loss": 0.3822, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 26.54397201538086, |
|
"learning_rate": 1.965072906069854e-05, |
|
"loss": 0.3972, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.1706455647945404, |
|
"learning_rate": 1.956595456086809e-05, |
|
"loss": 0.8122, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 119.8738021850586, |
|
"learning_rate": 1.948118006103764e-05, |
|
"loss": 0.5582, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 5.742654323577881, |
|
"learning_rate": 1.939640556120719e-05, |
|
"loss": 0.6886, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 42.857357025146484, |
|
"learning_rate": 1.931163106137674e-05, |
|
"loss": 0.1316, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 44.45591735839844, |
|
"learning_rate": 1.922685656154629e-05, |
|
"loss": 0.7597, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 2.140688180923462, |
|
"learning_rate": 1.914208206171584e-05, |
|
"loss": 0.6811, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 14.241903305053711, |
|
"learning_rate": 1.9057307561885384e-05, |
|
"loss": 0.9585, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 75.1405029296875, |
|
"learning_rate": 1.8972533062054933e-05, |
|
"loss": 0.8658, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.019668349996209145, |
|
"learning_rate": 1.8887758562224482e-05, |
|
"loss": 0.2094, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 24.360675811767578, |
|
"learning_rate": 1.880298406239403e-05, |
|
"loss": 0.8209, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 58.54786682128906, |
|
"learning_rate": 1.8718209562563584e-05, |
|
"loss": 1.3557, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.059292688965797424, |
|
"learning_rate": 1.8633435062733133e-05, |
|
"loss": 0.4244, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 19.321937561035156, |
|
"learning_rate": 1.8548660562902682e-05, |
|
"loss": 0.4429, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 22.25551986694336, |
|
"learning_rate": 1.8463886063072228e-05, |
|
"loss": 0.2562, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 21.07433319091797, |
|
"learning_rate": 1.8379111563241777e-05, |
|
"loss": 0.4458, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.06258882582187653, |
|
"learning_rate": 1.8294337063411326e-05, |
|
"loss": 0.7059, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 97.818603515625, |
|
"learning_rate": 1.8209562563580875e-05, |
|
"loss": 1.6226, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 64.59729766845703, |
|
"learning_rate": 1.8124788063750424e-05, |
|
"loss": 0.7478, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 10.768301963806152, |
|
"learning_rate": 1.8040013563919976e-05, |
|
"loss": 0.3018, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 125.17046356201172, |
|
"learning_rate": 1.7955239064089525e-05, |
|
"loss": 0.5633, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 20.224864959716797, |
|
"learning_rate": 1.787046456425907e-05, |
|
"loss": 0.7404, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 43.46826171875, |
|
"learning_rate": 1.778569006442862e-05, |
|
"loss": 1.3683, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 26.397693634033203, |
|
"learning_rate": 1.770091556459817e-05, |
|
"loss": 0.7154, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 70.86721801757812, |
|
"learning_rate": 1.7616141064767718e-05, |
|
"loss": 0.3547, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 7.348357200622559, |
|
"learning_rate": 1.7531366564937267e-05, |
|
"loss": 0.3216, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.027426056563854218, |
|
"learning_rate": 1.7446592065106816e-05, |
|
"loss": 0.2455, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 16.937259674072266, |
|
"learning_rate": 1.736181756527637e-05, |
|
"loss": 0.467, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.8604963421821594, |
|
"learning_rate": 1.7277043065445914e-05, |
|
"loss": 0.4664, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 76.17068481445312, |
|
"learning_rate": 1.7192268565615463e-05, |
|
"loss": 0.1975, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 11.509053230285645, |
|
"learning_rate": 1.7107494065785012e-05, |
|
"loss": 0.6729, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.039980944246053696, |
|
"learning_rate": 1.702271956595456e-05, |
|
"loss": 0.5038, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.07167255878448486, |
|
"learning_rate": 1.693794506612411e-05, |
|
"loss": 0.1302, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.1734821796417236, |
|
"learning_rate": 1.685317056629366e-05, |
|
"loss": 0.2821, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.076037883758545, |
|
"learning_rate": 1.676839606646321e-05, |
|
"loss": 0.55, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.02169458009302616, |
|
"learning_rate": 1.6683621566632758e-05, |
|
"loss": 0.0048, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 2.759131669998169, |
|
"learning_rate": 1.6598847066802307e-05, |
|
"loss": 0.8726, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.5331248641014099, |
|
"learning_rate": 1.6514072566971856e-05, |
|
"loss": 0.6762, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 22.186798095703125, |
|
"learning_rate": 1.6429298067141405e-05, |
|
"loss": 0.6426, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 32.538089752197266, |
|
"learning_rate": 1.6344523567310954e-05, |
|
"loss": 0.9122, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 6.58526086807251, |
|
"learning_rate": 1.6259749067480503e-05, |
|
"loss": 0.3097, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.05092554911971092, |
|
"learning_rate": 1.6174974567650052e-05, |
|
"loss": 0.0069, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 41.92851638793945, |
|
"learning_rate": 1.60902000678196e-05, |
|
"loss": 0.9276, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 152.6699676513672, |
|
"learning_rate": 1.600542556798915e-05, |
|
"loss": 0.4752, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 41.94477462768555, |
|
"learning_rate": 1.59206510681587e-05, |
|
"loss": 0.2672, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 57.44374084472656, |
|
"learning_rate": 1.5835876568328248e-05, |
|
"loss": 0.9764, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 23.202451705932617, |
|
"learning_rate": 1.5751102068497797e-05, |
|
"loss": 0.4835, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 8.41569709777832, |
|
"learning_rate": 1.5666327568667346e-05, |
|
"loss": 0.3972, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.2448299527168274, |
|
"learning_rate": 1.5581553068836892e-05, |
|
"loss": 0.3595, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.009353713132441044, |
|
"learning_rate": 1.5496778569006444e-05, |
|
"loss": 0.6254, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.40498995780944824, |
|
"learning_rate": 1.5412004069175993e-05, |
|
"loss": 0.4135, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.1234465166926384, |
|
"learning_rate": 1.5327229569345542e-05, |
|
"loss": 0.7479, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 15.938213348388672, |
|
"learning_rate": 1.5242455069515091e-05, |
|
"loss": 0.9297, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 2.941218376159668, |
|
"learning_rate": 1.515768056968464e-05, |
|
"loss": 1.0416, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 5.810397148132324, |
|
"learning_rate": 1.507290606985419e-05, |
|
"loss": 0.2707, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.005552912130951881, |
|
"learning_rate": 1.4988131570023737e-05, |
|
"loss": 0.6025, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.059763405472040176, |
|
"learning_rate": 1.4903357070193286e-05, |
|
"loss": 0.018, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.09440121799707413, |
|
"learning_rate": 1.4818582570362835e-05, |
|
"loss": 0.3638, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.17027807235717773, |
|
"learning_rate": 1.4733808070532384e-05, |
|
"loss": 0.3194, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.010031620971858501, |
|
"learning_rate": 1.4649033570701933e-05, |
|
"loss": 0.1356, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.03452069312334061, |
|
"learning_rate": 1.4564259070871484e-05, |
|
"loss": 0.4853, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.010452840477228165, |
|
"learning_rate": 1.4479484571041033e-05, |
|
"loss": 0.5158, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 9.103416442871094, |
|
"learning_rate": 1.4394710071210579e-05, |
|
"loss": 0.9832, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 26.68756103515625, |
|
"learning_rate": 1.430993557138013e-05, |
|
"loss": 0.8755, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 165.71353149414062, |
|
"learning_rate": 1.4225161071549678e-05, |
|
"loss": 1.1972, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 47.88841247558594, |
|
"learning_rate": 1.4140386571719227e-05, |
|
"loss": 0.335, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.27552178502082825, |
|
"learning_rate": 1.4055612071888776e-05, |
|
"loss": 0.9269, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 18.1062068939209, |
|
"learning_rate": 1.3970837572058325e-05, |
|
"loss": 0.2469, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.01228923350572586, |
|
"learning_rate": 1.3886063072227876e-05, |
|
"loss": 0.4695, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.010787318460643291, |
|
"learning_rate": 1.3801288572397422e-05, |
|
"loss": 0.0921, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.013667989522218704, |
|
"learning_rate": 1.3716514072566971e-05, |
|
"loss": 0.136, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.07841726392507553, |
|
"learning_rate": 1.3631739572736522e-05, |
|
"loss": 0.3449, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.020194262266159058, |
|
"learning_rate": 1.354696507290607e-05, |
|
"loss": 0.1784, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 69.3423843383789, |
|
"learning_rate": 1.346219057307562e-05, |
|
"loss": 0.6792, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.046759914606809616, |
|
"learning_rate": 1.3377416073245169e-05, |
|
"loss": 0.1271, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.011114859953522682, |
|
"learning_rate": 1.3292641573414718e-05, |
|
"loss": 0.2728, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 32.39384841918945, |
|
"learning_rate": 1.3207867073584265e-05, |
|
"loss": 0.337, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 45.9520263671875, |
|
"learning_rate": 1.3123092573753814e-05, |
|
"loss": 0.3638, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.45650675892829895, |
|
"learning_rate": 1.3038318073923363e-05, |
|
"loss": 0.4145, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.05143202096223831, |
|
"learning_rate": 1.2953543574092914e-05, |
|
"loss": 1.0756, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 6.376939296722412, |
|
"learning_rate": 1.2868769074262463e-05, |
|
"loss": 0.0657, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 25.659818649291992, |
|
"learning_rate": 1.2783994574432012e-05, |
|
"loss": 0.1626, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.00974774919450283, |
|
"learning_rate": 1.2699220074601561e-05, |
|
"loss": 0.136, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.016076885163784027, |
|
"learning_rate": 1.2614445574771109e-05, |
|
"loss": 0.6645, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.16606609523296356, |
|
"learning_rate": 1.2529671074940658e-05, |
|
"loss": 0.7139, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 70.56269836425781, |
|
"learning_rate": 1.2444896575110207e-05, |
|
"loss": 0.0413, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.15991821885108948, |
|
"learning_rate": 1.2360122075279756e-05, |
|
"loss": 0.8144, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.9377095699310303, |
|
"learning_rate": 1.2275347575449306e-05, |
|
"loss": 0.489, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.03479115292429924, |
|
"learning_rate": 1.2190573075618854e-05, |
|
"loss": 0.7886, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.03444279730319977, |
|
"learning_rate": 1.2105798575788403e-05, |
|
"loss": 0.5328, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.0390845350921154, |
|
"learning_rate": 1.2021024075957952e-05, |
|
"loss": 0.4317, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.3982301652431488, |
|
"learning_rate": 1.1936249576127503e-05, |
|
"loss": 1.1167, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.10968449711799622, |
|
"learning_rate": 1.185147507629705e-05, |
|
"loss": 0.4359, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.02797958254814148, |
|
"learning_rate": 1.1766700576466599e-05, |
|
"loss": 0.0039, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.00903019867837429, |
|
"learning_rate": 1.1681926076636148e-05, |
|
"loss": 0.3334, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.15302762389183044, |
|
"learning_rate": 1.1597151576805697e-05, |
|
"loss": 0.4842, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.855728030204773, |
|
"learning_rate": 1.1512377076975246e-05, |
|
"loss": 0.0593, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.005731168668717146, |
|
"learning_rate": 1.1427602577144795e-05, |
|
"loss": 0.2487, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.14197255671024323, |
|
"learning_rate": 1.1342828077314344e-05, |
|
"loss": 0.3685, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 28.626758575439453, |
|
"learning_rate": 1.1258053577483893e-05, |
|
"loss": 0.1837, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.06634432077407837, |
|
"learning_rate": 1.1173279077653442e-05, |
|
"loss": 0.9408, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 57.522308349609375, |
|
"learning_rate": 1.1088504577822991e-05, |
|
"loss": 0.8385, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.012267855927348137, |
|
"learning_rate": 1.100373007799254e-05, |
|
"loss": 0.3534, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.008047958835959435, |
|
"learning_rate": 1.091895557816209e-05, |
|
"loss": 0.1124, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.02493928372859955, |
|
"learning_rate": 1.0834181078331639e-05, |
|
"loss": 0.9354, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 46.4794921875, |
|
"learning_rate": 1.0749406578501188e-05, |
|
"loss": 0.6517, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.6745705604553223, |
|
"learning_rate": 1.0664632078670737e-05, |
|
"loss": 0.2617, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 52.141422271728516, |
|
"learning_rate": 1.0579857578840286e-05, |
|
"loss": 1.0205, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.015956485643982887, |
|
"learning_rate": 1.0495083079009835e-05, |
|
"loss": 0.0031, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.00936069618910551, |
|
"learning_rate": 1.0410308579179382e-05, |
|
"loss": 0.4062, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 2.8617632389068604, |
|
"learning_rate": 1.0325534079348933e-05, |
|
"loss": 0.6192, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 44.38825988769531, |
|
"learning_rate": 1.0240759579518482e-05, |
|
"loss": 0.9206, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.07197415083646774, |
|
"learning_rate": 1.0155985079688031e-05, |
|
"loss": 0.5991, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.07943376153707504, |
|
"learning_rate": 1.0071210579857578e-05, |
|
"loss": 0.7726, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.014867642894387245, |
|
"learning_rate": 9.986436080027129e-06, |
|
"loss": 0.4536, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 3.3331596851348877, |
|
"learning_rate": 9.901661580196678e-06, |
|
"loss": 0.1273, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.49268820881843567, |
|
"learning_rate": 9.816887080366225e-06, |
|
"loss": 0.1124, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 50.03787612915039, |
|
"learning_rate": 9.732112580535774e-06, |
|
"loss": 0.403, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 65.66496276855469, |
|
"learning_rate": 9.647338080705325e-06, |
|
"loss": 0.3994, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 21.710723876953125, |
|
"learning_rate": 9.562563580874874e-06, |
|
"loss": 0.2621, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.0246584489941597, |
|
"learning_rate": 9.477789081044422e-06, |
|
"loss": 0.4591, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 60.308345794677734, |
|
"learning_rate": 9.39301458121397e-06, |
|
"loss": 0.0449, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 136.57911682128906, |
|
"learning_rate": 9.308240081383521e-06, |
|
"loss": 0.6523, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.011092737317085266, |
|
"learning_rate": 9.223465581553069e-06, |
|
"loss": 0.5621, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 72.57099151611328, |
|
"learning_rate": 9.138691081722618e-06, |
|
"loss": 0.3213, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.5914633274078369, |
|
"learning_rate": 9.053916581892167e-06, |
|
"loss": 0.5433, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.004101056605577469, |
|
"learning_rate": 8.969142082061716e-06, |
|
"loss": 0.1477, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.14881229400634766, |
|
"learning_rate": 8.884367582231265e-06, |
|
"loss": 0.3333, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.2352166622877121, |
|
"learning_rate": 8.799593082400814e-06, |
|
"loss": 0.474, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.024752633646130562, |
|
"learning_rate": 8.714818582570363e-06, |
|
"loss": 0.4311, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.5896317958831787, |
|
"learning_rate": 8.630044082739912e-06, |
|
"loss": 0.5513, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.06998773664236069, |
|
"learning_rate": 8.545269582909461e-06, |
|
"loss": 0.2412, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.060008566826581955, |
|
"learning_rate": 8.46049508307901e-06, |
|
"loss": 0.541, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 130.00119018554688, |
|
"learning_rate": 8.37572058324856e-06, |
|
"loss": 0.4602, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 114.44886779785156, |
|
"learning_rate": 8.290946083418108e-06, |
|
"loss": 0.594, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.1713523268699646, |
|
"learning_rate": 8.206171583587657e-06, |
|
"loss": 0.0291, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.02883375808596611, |
|
"learning_rate": 8.121397083757206e-06, |
|
"loss": 0.1052, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.028804084286093712, |
|
"learning_rate": 8.036622583926755e-06, |
|
"loss": 0.4604, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 3.535491704940796, |
|
"learning_rate": 7.951848084096304e-06, |
|
"loss": 0.3482, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.026217030361294746, |
|
"learning_rate": 7.867073584265853e-06, |
|
"loss": 0.2586, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.3666705787181854, |
|
"learning_rate": 7.7822990844354e-06, |
|
"loss": 0.9029, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.017206447198987007, |
|
"learning_rate": 7.697524584604952e-06, |
|
"loss": 0.3287, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.011142526753246784, |
|
"learning_rate": 7.612750084774501e-06, |
|
"loss": 0.1899, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.032814279198646545, |
|
"learning_rate": 7.52797558494405e-06, |
|
"loss": 0.6027, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 4.121613502502441, |
|
"learning_rate": 7.443201085113598e-06, |
|
"loss": 0.4039, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.02050989866256714, |
|
"learning_rate": 7.358426585283147e-06, |
|
"loss": 1.1173, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 31.819725036621094, |
|
"learning_rate": 7.273652085452697e-06, |
|
"loss": 0.4358, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 62.192996978759766, |
|
"learning_rate": 7.188877585622244e-06, |
|
"loss": 0.2461, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 4.339864730834961, |
|
"learning_rate": 7.104103085791794e-06, |
|
"loss": 0.2663, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.04583854600787163, |
|
"learning_rate": 7.019328585961343e-06, |
|
"loss": 0.0101, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 30.54637336730957, |
|
"learning_rate": 6.934554086130893e-06, |
|
"loss": 0.6777, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 8.97301959991455, |
|
"learning_rate": 6.84977958630044e-06, |
|
"loss": 0.4932, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.008320887573063374, |
|
"learning_rate": 6.76500508646999e-06, |
|
"loss": 0.4379, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.00964405108243227, |
|
"learning_rate": 6.680230586639539e-06, |
|
"loss": 0.5022, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.38759058713912964, |
|
"learning_rate": 6.5954560868090875e-06, |
|
"loss": 0.26, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.011307528242468834, |
|
"learning_rate": 6.5106815869786366e-06, |
|
"loss": 0.1178, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.06669808179140091, |
|
"learning_rate": 6.4259070871481864e-06, |
|
"loss": 0.3514, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 130.25222778320312, |
|
"learning_rate": 6.3411325873177355e-06, |
|
"loss": 0.0678, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.02906797081232071, |
|
"learning_rate": 6.256358087487284e-06, |
|
"loss": 0.9587, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.06155577301979065, |
|
"learning_rate": 6.171583587656833e-06, |
|
"loss": 0.3508, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.007107898592948914, |
|
"learning_rate": 6.086809087826382e-06, |
|
"loss": 0.2734, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 72.81842803955078, |
|
"learning_rate": 6.002034587995931e-06, |
|
"loss": 0.2777, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.008903439156711102, |
|
"learning_rate": 5.91726008816548e-06, |
|
"loss": 0.3983, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.018965894356369972, |
|
"learning_rate": 5.832485588335029e-06, |
|
"loss": 0.1067, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.039035771042108536, |
|
"learning_rate": 5.747711088504578e-06, |
|
"loss": 0.7816, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 190.41064453125, |
|
"learning_rate": 5.662936588674127e-06, |
|
"loss": 0.3051, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.00861747283488512, |
|
"learning_rate": 5.578162088843676e-06, |
|
"loss": 0.0098, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 9.695987701416016, |
|
"learning_rate": 5.493387589013225e-06, |
|
"loss": 0.0252, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.0687270537018776, |
|
"learning_rate": 5.408613089182774e-06, |
|
"loss": 0.1273, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.01117786020040512, |
|
"learning_rate": 5.323838589352323e-06, |
|
"loss": 0.8719, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.017118388786911964, |
|
"learning_rate": 5.239064089521872e-06, |
|
"loss": 0.8036, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.834500789642334, |
|
"learning_rate": 5.154289589691421e-06, |
|
"loss": 0.6443, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.03857054561376572, |
|
"learning_rate": 5.0695150898609695e-06, |
|
"loss": 0.3097, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.3266679048538208, |
|
"learning_rate": 4.984740590030519e-06, |
|
"loss": 0.5779, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.008399288170039654, |
|
"learning_rate": 4.899966090200068e-06, |
|
"loss": 0.3936, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 48.30754852294922, |
|
"learning_rate": 4.8151915903696175e-06, |
|
"loss": 0.747, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.35246542096138, |
|
"learning_rate": 4.730417090539166e-06, |
|
"loss": 0.1861, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.10090816020965576, |
|
"learning_rate": 4.645642590708716e-06, |
|
"loss": 0.2213, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 2.081159830093384, |
|
"learning_rate": 4.560868090878264e-06, |
|
"loss": 0.3135, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 88.7223129272461, |
|
"learning_rate": 4.476093591047813e-06, |
|
"loss": 0.7422, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.028826922178268433, |
|
"learning_rate": 4.391319091217362e-06, |
|
"loss": 0.1471, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.006259507965296507, |
|
"learning_rate": 4.306544591386911e-06, |
|
"loss": 0.0344, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.010470997542142868, |
|
"learning_rate": 4.22177009155646e-06, |
|
"loss": 0.0023, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.0066988361068069935, |
|
"learning_rate": 4.136995591726009e-06, |
|
"loss": 0.0013, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.004910880699753761, |
|
"learning_rate": 4.052221091895558e-06, |
|
"loss": 0.1276, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 115.09222412109375, |
|
"learning_rate": 3.967446592065107e-06, |
|
"loss": 0.4087, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.25051459670066833, |
|
"learning_rate": 3.882672092234655e-06, |
|
"loss": 0.0014, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 6.038802146911621, |
|
"learning_rate": 3.7978975924042052e-06, |
|
"loss": 0.01, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.012906698510050774, |
|
"learning_rate": 3.713123092573754e-06, |
|
"loss": 0.0645, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 191.30227661132812, |
|
"learning_rate": 3.6283485927433033e-06, |
|
"loss": 0.4189, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 136.50938415527344, |
|
"learning_rate": 3.543574092912852e-06, |
|
"loss": 0.1618, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.011031740345060825, |
|
"learning_rate": 3.4587995930824014e-06, |
|
"loss": 0.0745, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 25.427879333496094, |
|
"learning_rate": 3.37402509325195e-06, |
|
"loss": 0.6259, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.011854914017021656, |
|
"learning_rate": 3.2892505934214987e-06, |
|
"loss": 0.0394, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 35.059471130371094, |
|
"learning_rate": 3.204476093591048e-06, |
|
"loss": 0.4307, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.061668381094932556, |
|
"learning_rate": 3.1197015937605968e-06, |
|
"loss": 0.3687, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.013285267166793346, |
|
"learning_rate": 3.034927093930146e-06, |
|
"loss": 0.9381, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 64.50102233886719, |
|
"learning_rate": 2.950152594099695e-06, |
|
"loss": 0.97, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.057166144251823425, |
|
"learning_rate": 2.865378094269244e-06, |
|
"loss": 0.0254, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 69.94447326660156, |
|
"learning_rate": 2.780603594438793e-06, |
|
"loss": 0.861, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 157.21966552734375, |
|
"learning_rate": 2.695829094608342e-06, |
|
"loss": 0.1439, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.007634480483829975, |
|
"learning_rate": 2.611054594777891e-06, |
|
"loss": 0.5199, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.01918291673064232, |
|
"learning_rate": 2.5262800949474397e-06, |
|
"loss": 1.0748, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.016131578013300896, |
|
"learning_rate": 2.4415055951169887e-06, |
|
"loss": 0.0686, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.016394583508372307, |
|
"learning_rate": 2.3567310952865378e-06, |
|
"loss": 0.3871, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.004754968453198671, |
|
"learning_rate": 2.271956595456087e-06, |
|
"loss": 0.187, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 56.73072814941406, |
|
"learning_rate": 2.187182095625636e-06, |
|
"loss": 0.4921, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.08189795911312103, |
|
"learning_rate": 2.102407595795185e-06, |
|
"loss": 0.2838, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 61.22605514526367, |
|
"learning_rate": 2.017633095964734e-06, |
|
"loss": 0.4503, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.1310381144285202, |
|
"learning_rate": 1.932858596134283e-06, |
|
"loss": 0.8475, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 8.494524002075195, |
|
"learning_rate": 1.8480840963038318e-06, |
|
"loss": 0.4413, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.06928512454032898, |
|
"learning_rate": 1.7633095964733809e-06, |
|
"loss": 0.3415, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 143.20860290527344, |
|
"learning_rate": 1.67853509664293e-06, |
|
"loss": 0.742, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.0477374829351902, |
|
"learning_rate": 1.593760596812479e-06, |
|
"loss": 0.2915, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.10252419114112854, |
|
"learning_rate": 1.5089860969820278e-06, |
|
"loss": 0.1603, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 96.32301330566406, |
|
"learning_rate": 1.4242115971515769e-06, |
|
"loss": 0.7714, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.06277238577604294, |
|
"learning_rate": 1.339437097321126e-06, |
|
"loss": 0.0346, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.02291535772383213, |
|
"learning_rate": 1.254662597490675e-06, |
|
"loss": 0.4468, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 137.12705993652344, |
|
"learning_rate": 1.169888097660224e-06, |
|
"loss": 0.2153, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.162826269865036, |
|
"learning_rate": 1.085113597829773e-06, |
|
"loss": 0.0054, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 2.6479134559631348, |
|
"learning_rate": 1.0003390979993219e-06, |
|
"loss": 0.3338, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 4.235650539398193, |
|
"learning_rate": 9.155645981688708e-07, |
|
"loss": 0.4362, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.006349239964038134, |
|
"learning_rate": 8.307900983384199e-07, |
|
"loss": 0.3818, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.03322937712073326, |
|
"learning_rate": 7.460155985079689e-07, |
|
"loss": 0.3669, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 53.04838562011719, |
|
"learning_rate": 6.612410986775179e-07, |
|
"loss": 0.5879, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.029276695102453232, |
|
"learning_rate": 5.764665988470668e-07, |
|
"loss": 0.3514, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.07442772388458252, |
|
"learning_rate": 4.916920990166158e-07, |
|
"loss": 0.5732, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.02505446970462799, |
|
"learning_rate": 4.069175991861648e-07, |
|
"loss": 0.7, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 159.9132843017578, |
|
"learning_rate": 3.2214309935571383e-07, |
|
"loss": 0.4448, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.02801443822681904, |
|
"learning_rate": 2.3736859952526283e-07, |
|
"loss": 0.0937, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.011555209755897522, |
|
"learning_rate": 1.525940996948118e-07, |
|
"loss": 0.2084, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 67.56371307373047, |
|
"learning_rate": 6.781959986436082e-08, |
|
"loss": 0.2189, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.9062329141607436, |
|
"eval_loss": 0.3474305272102356, |
|
"eval_runtime": 3127.9579, |
|
"eval_samples_per_second": 1.169, |
|
"eval_steps_per_second": 0.585, |
|
"step": 6398 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 6398, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"total_flos": 3.278594912567663e+19, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |