{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1167,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02570694087403599,
      "grad_norm": 5.865927219390869,
      "learning_rate": 5e-06,
      "loss": 0.9141,
      "step": 10
    },
    {
      "epoch": 0.05141388174807198,
      "grad_norm": 1.573315978050232,
      "learning_rate": 5e-06,
      "loss": 0.7782,
      "step": 20
    },
    {
      "epoch": 0.07712082262210797,
      "grad_norm": 1.0980241298675537,
      "learning_rate": 5e-06,
      "loss": 0.7291,
      "step": 30
    },
    {
      "epoch": 0.10282776349614396,
      "grad_norm": 0.8817281723022461,
      "learning_rate": 5e-06,
      "loss": 0.7015,
      "step": 40
    },
    {
      "epoch": 0.12853470437017994,
      "grad_norm": 1.167167067527771,
      "learning_rate": 5e-06,
      "loss": 0.6798,
      "step": 50
    },
    {
      "epoch": 0.15424164524421594,
      "grad_norm": 2.2831735610961914,
      "learning_rate": 5e-06,
      "loss": 0.6661,
      "step": 60
    },
    {
      "epoch": 0.17994858611825193,
      "grad_norm": 0.8653743863105774,
      "learning_rate": 5e-06,
      "loss": 0.6524,
      "step": 70
    },
    {
      "epoch": 0.20565552699228792,
      "grad_norm": 0.9545003175735474,
      "learning_rate": 5e-06,
      "loss": 0.6527,
      "step": 80
    },
    {
      "epoch": 0.23136246786632392,
      "grad_norm": 0.9084785580635071,
      "learning_rate": 5e-06,
      "loss": 0.6451,
      "step": 90
    },
    {
      "epoch": 0.2570694087403599,
      "grad_norm": 0.5454579591751099,
      "learning_rate": 5e-06,
      "loss": 0.6408,
      "step": 100
    },
    {
      "epoch": 0.2827763496143959,
      "grad_norm": 0.5495653748512268,
      "learning_rate": 5e-06,
      "loss": 0.6224,
      "step": 110
    },
    {
      "epoch": 0.30848329048843187,
      "grad_norm": 0.6504859328269958,
      "learning_rate": 5e-06,
      "loss": 0.6313,
      "step": 120
    },
    {
      "epoch": 0.3341902313624679,
      "grad_norm": 0.828336238861084,
      "learning_rate": 5e-06,
      "loss": 0.6324,
      "step": 130
    },
    {
      "epoch": 0.35989717223650386,
      "grad_norm": 0.5564887523651123,
      "learning_rate": 5e-06,
      "loss": 0.6231,
      "step": 140
    },
    {
      "epoch": 0.3856041131105398,
      "grad_norm": 0.7280133962631226,
      "learning_rate": 5e-06,
      "loss": 0.617,
      "step": 150
    },
    {
      "epoch": 0.41131105398457585,
      "grad_norm": 0.5732916593551636,
      "learning_rate": 5e-06,
      "loss": 0.6161,
      "step": 160
    },
    {
      "epoch": 0.4370179948586118,
      "grad_norm": 0.5130185484886169,
      "learning_rate": 5e-06,
      "loss": 0.616,
      "step": 170
    },
    {
      "epoch": 0.46272493573264784,
      "grad_norm": 1.034699559211731,
      "learning_rate": 5e-06,
      "loss": 0.6186,
      "step": 180
    },
    {
      "epoch": 0.4884318766066838,
      "grad_norm": 0.6324091553688049,
      "learning_rate": 5e-06,
      "loss": 0.6078,
      "step": 190
    },
    {
      "epoch": 0.5141388174807198,
      "grad_norm": 0.5066195726394653,
      "learning_rate": 5e-06,
      "loss": 0.6054,
      "step": 200
    },
    {
      "epoch": 0.5398457583547558,
      "grad_norm": 0.5803439021110535,
      "learning_rate": 5e-06,
      "loss": 0.6103,
      "step": 210
    },
    {
      "epoch": 0.5655526992287918,
      "grad_norm": 0.49449267983436584,
      "learning_rate": 5e-06,
      "loss": 0.6072,
      "step": 220
    },
    {
      "epoch": 0.5912596401028277,
      "grad_norm": 0.553966760635376,
      "learning_rate": 5e-06,
      "loss": 0.6012,
      "step": 230
    },
    {
      "epoch": 0.6169665809768637,
      "grad_norm": 0.5949456095695496,
      "learning_rate": 5e-06,
      "loss": 0.5979,
      "step": 240
    },
    {
      "epoch": 0.6426735218508998,
      "grad_norm": 0.5329740047454834,
      "learning_rate": 5e-06,
      "loss": 0.5985,
      "step": 250
    },
    {
      "epoch": 0.6683804627249358,
      "grad_norm": 0.6510268449783325,
      "learning_rate": 5e-06,
      "loss": 0.5938,
      "step": 260
    },
    {
      "epoch": 0.6940874035989717,
      "grad_norm": 0.4840281009674072,
      "learning_rate": 5e-06,
      "loss": 0.6043,
      "step": 270
    },
    {
      "epoch": 0.7197943444730077,
      "grad_norm": 0.9149703979492188,
      "learning_rate": 5e-06,
      "loss": 0.5969,
      "step": 280
    },
    {
      "epoch": 0.7455012853470437,
      "grad_norm": 0.6490294933319092,
      "learning_rate": 5e-06,
      "loss": 0.5942,
      "step": 290
    },
    {
      "epoch": 0.7712082262210797,
      "grad_norm": 0.6224178671836853,
      "learning_rate": 5e-06,
      "loss": 0.5936,
      "step": 300
    },
    {
      "epoch": 0.7969151670951157,
      "grad_norm": 0.5585442185401917,
      "learning_rate": 5e-06,
      "loss": 0.5903,
      "step": 310
    },
    {
      "epoch": 0.8226221079691517,
      "grad_norm": 0.5131608247756958,
      "learning_rate": 5e-06,
      "loss": 0.5864,
      "step": 320
    },
    {
      "epoch": 0.8483290488431876,
      "grad_norm": 0.7199530005455017,
      "learning_rate": 5e-06,
      "loss": 0.5891,
      "step": 330
    },
    {
      "epoch": 0.8740359897172236,
      "grad_norm": 0.5370771884918213,
      "learning_rate": 5e-06,
      "loss": 0.5793,
      "step": 340
    },
    {
      "epoch": 0.8997429305912596,
      "grad_norm": 0.7190908193588257,
      "learning_rate": 5e-06,
      "loss": 0.5908,
      "step": 350
    },
    {
      "epoch": 0.9254498714652957,
      "grad_norm": 0.5482605695724487,
      "learning_rate": 5e-06,
      "loss": 0.5825,
      "step": 360
    },
    {
      "epoch": 0.9511568123393316,
      "grad_norm": 0.5382151007652283,
      "learning_rate": 5e-06,
      "loss": 0.585,
      "step": 370
    },
    {
      "epoch": 0.9768637532133676,
      "grad_norm": 0.5443570017814636,
      "learning_rate": 5e-06,
      "loss": 0.5859,
      "step": 380
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.5778310894966125,
      "eval_runtime": 35.0525,
      "eval_samples_per_second": 299.038,
      "eval_steps_per_second": 1.17,
      "step": 389
    },
    {
      "epoch": 1.0025706940874035,
      "grad_norm": 0.8967987298965454,
      "learning_rate": 5e-06,
      "loss": 0.5767,
      "step": 390
    },
    {
      "epoch": 1.0282776349614395,
      "grad_norm": 0.5881332159042358,
      "learning_rate": 5e-06,
      "loss": 0.546,
      "step": 400
    },
    {
      "epoch": 1.0539845758354756,
      "grad_norm": 0.7526522278785706,
      "learning_rate": 5e-06,
      "loss": 0.5478,
      "step": 410
    },
    {
      "epoch": 1.0796915167095116,
      "grad_norm": 0.5316684246063232,
      "learning_rate": 5e-06,
      "loss": 0.5436,
      "step": 420
    },
    {
      "epoch": 1.1053984575835476,
      "grad_norm": 0.49085283279418945,
      "learning_rate": 5e-06,
      "loss": 0.5406,
      "step": 430
    },
    {
      "epoch": 1.1311053984575836,
      "grad_norm": 0.670981228351593,
      "learning_rate": 5e-06,
      "loss": 0.5446,
      "step": 440
    },
    {
      "epoch": 1.1568123393316196,
      "grad_norm": 0.5618010759353638,
      "learning_rate": 5e-06,
      "loss": 0.545,
      "step": 450
    },
    {
      "epoch": 1.1825192802056554,
      "grad_norm": 0.6097432971000671,
      "learning_rate": 5e-06,
      "loss": 0.5405,
      "step": 460
    },
    {
      "epoch": 1.2082262210796915,
      "grad_norm": 0.5328357815742493,
      "learning_rate": 5e-06,
      "loss": 0.5434,
      "step": 470
    },
    {
      "epoch": 1.2339331619537275,
      "grad_norm": 0.5351125597953796,
      "learning_rate": 5e-06,
      "loss": 0.5458,
      "step": 480
    },
    {
      "epoch": 1.2596401028277635,
      "grad_norm": 0.7780609130859375,
      "learning_rate": 5e-06,
      "loss": 0.5419,
      "step": 490
    },
    {
      "epoch": 1.2853470437017995,
      "grad_norm": 0.5531721115112305,
      "learning_rate": 5e-06,
      "loss": 0.5399,
      "step": 500
    },
    {
      "epoch": 1.3110539845758356,
      "grad_norm": 0.7177889943122864,
      "learning_rate": 5e-06,
      "loss": 0.5429,
      "step": 510
    },
    {
      "epoch": 1.3367609254498714,
      "grad_norm": 0.6722058653831482,
      "learning_rate": 5e-06,
      "loss": 0.5372,
      "step": 520
    },
    {
      "epoch": 1.3624678663239074,
      "grad_norm": 0.515562117099762,
      "learning_rate": 5e-06,
      "loss": 0.5335,
      "step": 530
    },
    {
      "epoch": 1.3881748071979434,
      "grad_norm": 0.6760542392730713,
      "learning_rate": 5e-06,
      "loss": 0.5361,
      "step": 540
    },
    {
      "epoch": 1.4138817480719794,
      "grad_norm": 0.7421192526817322,
      "learning_rate": 5e-06,
      "loss": 0.5399,
      "step": 550
    },
    {
      "epoch": 1.4395886889460154,
      "grad_norm": 0.5604066848754883,
      "learning_rate": 5e-06,
      "loss": 0.5426,
      "step": 560
    },
    {
      "epoch": 1.4652956298200515,
      "grad_norm": 0.5541772246360779,
      "learning_rate": 5e-06,
      "loss": 0.5392,
      "step": 570
    },
    {
      "epoch": 1.4910025706940875,
      "grad_norm": 0.5291936993598938,
      "learning_rate": 5e-06,
      "loss": 0.5355,
      "step": 580
    },
    {
      "epoch": 1.5167095115681235,
      "grad_norm": 0.5513972043991089,
      "learning_rate": 5e-06,
      "loss": 0.5388,
      "step": 590
    },
    {
      "epoch": 1.5424164524421595,
      "grad_norm": 0.5410069227218628,
      "learning_rate": 5e-06,
      "loss": 0.5351,
      "step": 600
    },
    {
      "epoch": 1.5681233933161953,
      "grad_norm": 0.5600557327270508,
      "learning_rate": 5e-06,
      "loss": 0.5368,
      "step": 610
    },
    {
      "epoch": 1.5938303341902313,
      "grad_norm": 0.5075570344924927,
      "learning_rate": 5e-06,
      "loss": 0.5274,
      "step": 620
    },
    {
      "epoch": 1.6195372750642674,
      "grad_norm": 0.5874601006507874,
      "learning_rate": 5e-06,
      "loss": 0.5335,
      "step": 630
    },
    {
      "epoch": 1.6452442159383034,
      "grad_norm": 0.5573471188545227,
      "learning_rate": 5e-06,
      "loss": 0.5282,
      "step": 640
    },
    {
      "epoch": 1.6709511568123392,
      "grad_norm": 0.7181922793388367,
      "learning_rate": 5e-06,
      "loss": 0.5357,
      "step": 650
    },
    {
      "epoch": 1.6966580976863752,
      "grad_norm": 0.5899803638458252,
      "learning_rate": 5e-06,
      "loss": 0.5281,
      "step": 660
    },
    {
      "epoch": 1.7223650385604112,
      "grad_norm": 0.5040240287780762,
      "learning_rate": 5e-06,
      "loss": 0.532,
      "step": 670
    },
    {
      "epoch": 1.7480719794344473,
      "grad_norm": 0.5562026500701904,
      "learning_rate": 5e-06,
      "loss": 0.5356,
      "step": 680
    },
    {
      "epoch": 1.7737789203084833,
      "grad_norm": 0.5617057681083679,
      "learning_rate": 5e-06,
      "loss": 0.5275,
      "step": 690
    },
    {
      "epoch": 1.7994858611825193,
      "grad_norm": 0.5864099860191345,
      "learning_rate": 5e-06,
      "loss": 0.5357,
      "step": 700
    },
    {
      "epoch": 1.8251928020565553,
      "grad_norm": 0.6038815975189209,
      "learning_rate": 5e-06,
      "loss": 0.5279,
      "step": 710
    },
    {
      "epoch": 1.8508997429305913,
      "grad_norm": 0.4948534667491913,
      "learning_rate": 5e-06,
      "loss": 0.5288,
      "step": 720
    },
    {
      "epoch": 1.8766066838046274,
      "grad_norm": 0.4712887406349182,
      "learning_rate": 5e-06,
      "loss": 0.5292,
      "step": 730
    },
    {
      "epoch": 1.9023136246786634,
      "grad_norm": 0.5233219265937805,
      "learning_rate": 5e-06,
      "loss": 0.5271,
      "step": 740
    },
    {
      "epoch": 1.9280205655526992,
      "grad_norm": 0.7113652229309082,
      "learning_rate": 5e-06,
      "loss": 0.5277,
      "step": 750
    },
    {
      "epoch": 1.9537275064267352,
      "grad_norm": 0.5814751982688904,
      "learning_rate": 5e-06,
      "loss": 0.5268,
      "step": 760
    },
    {
      "epoch": 1.9794344473007712,
      "grad_norm": 0.530799388885498,
      "learning_rate": 5e-06,
      "loss": 0.5297,
      "step": 770
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.5520427227020264,
      "eval_runtime": 36.0966,
      "eval_samples_per_second": 290.388,
      "eval_steps_per_second": 1.136,
      "step": 778
    },
    {
      "epoch": 2.005141388174807,
      "grad_norm": 0.7985607981681824,
      "learning_rate": 5e-06,
      "loss": 0.5175,
      "step": 780
    },
    {
      "epoch": 2.030848329048843,
      "grad_norm": 0.6567503809928894,
      "learning_rate": 5e-06,
      "loss": 0.4906,
      "step": 790
    },
    {
      "epoch": 2.056555269922879,
      "grad_norm": 0.5952147841453552,
      "learning_rate": 5e-06,
      "loss": 0.4893,
      "step": 800
    },
    {
      "epoch": 2.082262210796915,
      "grad_norm": 0.6156149506568909,
      "learning_rate": 5e-06,
      "loss": 0.4989,
      "step": 810
    },
    {
      "epoch": 2.107969151670951,
      "grad_norm": 0.48527252674102783,
      "learning_rate": 5e-06,
      "loss": 0.4874,
      "step": 820
    },
    {
      "epoch": 2.133676092544987,
      "grad_norm": 0.546338677406311,
      "learning_rate": 5e-06,
      "loss": 0.4944,
      "step": 830
    },
    {
      "epoch": 2.159383033419023,
      "grad_norm": 0.5232033133506775,
      "learning_rate": 5e-06,
      "loss": 0.4871,
      "step": 840
    },
    {
      "epoch": 2.185089974293059,
      "grad_norm": 0.5629530549049377,
      "learning_rate": 5e-06,
      "loss": 0.4928,
      "step": 850
    },
    {
      "epoch": 2.210796915167095,
      "grad_norm": 0.6175139546394348,
      "learning_rate": 5e-06,
      "loss": 0.4909,
      "step": 860
    },
    {
      "epoch": 2.236503856041131,
      "grad_norm": 0.5951898097991943,
      "learning_rate": 5e-06,
      "loss": 0.489,
      "step": 870
    },
    {
      "epoch": 2.2622107969151672,
      "grad_norm": 0.57947838306427,
      "learning_rate": 5e-06,
      "loss": 0.4838,
      "step": 880
    },
    {
      "epoch": 2.2879177377892033,
      "grad_norm": 0.6488301753997803,
      "learning_rate": 5e-06,
      "loss": 0.4892,
      "step": 890
    },
    {
      "epoch": 2.3136246786632393,
      "grad_norm": 0.6758255362510681,
      "learning_rate": 5e-06,
      "loss": 0.4958,
      "step": 900
    },
    {
      "epoch": 2.3393316195372753,
      "grad_norm": 0.5344276428222656,
      "learning_rate": 5e-06,
      "loss": 0.4904,
      "step": 910
    },
    {
      "epoch": 2.365038560411311,
      "grad_norm": 0.560171365737915,
      "learning_rate": 5e-06,
      "loss": 0.4949,
      "step": 920
    },
    {
      "epoch": 2.390745501285347,
      "grad_norm": 0.6933562159538269,
      "learning_rate": 5e-06,
      "loss": 0.4941,
      "step": 930
    },
    {
      "epoch": 2.416452442159383,
      "grad_norm": 0.5910319089889526,
      "learning_rate": 5e-06,
      "loss": 0.4943,
      "step": 940
    },
    {
      "epoch": 2.442159383033419,
      "grad_norm": 0.5552846789360046,
      "learning_rate": 5e-06,
      "loss": 0.4924,
      "step": 950
    },
    {
      "epoch": 2.467866323907455,
      "grad_norm": 0.4899492859840393,
      "learning_rate": 5e-06,
      "loss": 0.486,
      "step": 960
    },
    {
      "epoch": 2.493573264781491,
      "grad_norm": 0.6860612630844116,
      "learning_rate": 5e-06,
      "loss": 0.4907,
      "step": 970
    },
    {
      "epoch": 2.519280205655527,
      "grad_norm": 0.5359469652175903,
      "learning_rate": 5e-06,
      "loss": 0.496,
      "step": 980
    },
    {
      "epoch": 2.544987146529563,
      "grad_norm": 0.49907293915748596,
      "learning_rate": 5e-06,
      "loss": 0.4927,
      "step": 990
    },
    {
      "epoch": 2.570694087403599,
      "grad_norm": 0.5387750864028931,
      "learning_rate": 5e-06,
      "loss": 0.4859,
      "step": 1000
    },
    {
      "epoch": 2.596401028277635,
      "grad_norm": 0.5668651461601257,
      "learning_rate": 5e-06,
      "loss": 0.4908,
      "step": 1010
    },
    {
      "epoch": 2.622107969151671,
      "grad_norm": 0.5228598117828369,
      "learning_rate": 5e-06,
      "loss": 0.4894,
      "step": 1020
    },
    {
      "epoch": 2.6478149100257067,
      "grad_norm": 0.5581383109092712,
      "learning_rate": 5e-06,
      "loss": 0.4913,
      "step": 1030
    },
    {
      "epoch": 2.6735218508997427,
      "grad_norm": 0.5759613513946533,
      "learning_rate": 5e-06,
      "loss": 0.4883,
      "step": 1040
    },
    {
      "epoch": 2.6992287917737787,
      "grad_norm": 0.6597785949707031,
      "learning_rate": 5e-06,
      "loss": 0.4852,
      "step": 1050
    },
    {
      "epoch": 2.7249357326478147,
      "grad_norm": 0.572317361831665,
      "learning_rate": 5e-06,
      "loss": 0.4875,
      "step": 1060
    },
    {
      "epoch": 2.7506426735218508,
      "grad_norm": 0.48137277364730835,
      "learning_rate": 5e-06,
      "loss": 0.4862,
      "step": 1070
    },
    {
      "epoch": 2.776349614395887,
      "grad_norm": 0.5613889694213867,
      "learning_rate": 5e-06,
      "loss": 0.4866,
      "step": 1080
    },
    {
      "epoch": 2.802056555269923,
      "grad_norm": 0.5657880902290344,
      "learning_rate": 5e-06,
      "loss": 0.4882,
      "step": 1090
    },
    {
      "epoch": 2.827763496143959,
      "grad_norm": 0.48773303627967834,
      "learning_rate": 5e-06,
      "loss": 0.4857,
      "step": 1100
    },
    {
      "epoch": 2.853470437017995,
      "grad_norm": 0.5101090669631958,
      "learning_rate": 5e-06,
      "loss": 0.4844,
      "step": 1110
    },
    {
      "epoch": 2.879177377892031,
      "grad_norm": 0.5357456207275391,
      "learning_rate": 5e-06,
      "loss": 0.4877,
      "step": 1120
    },
    {
      "epoch": 2.904884318766067,
      "grad_norm": 0.49460962414741516,
      "learning_rate": 5e-06,
      "loss": 0.4853,
      "step": 1130
    },
    {
      "epoch": 2.930591259640103,
      "grad_norm": 0.5633171796798706,
      "learning_rate": 5e-06,
      "loss": 0.4856,
      "step": 1140
    },
    {
      "epoch": 2.956298200514139,
      "grad_norm": 0.5715927481651306,
      "learning_rate": 5e-06,
      "loss": 0.4866,
      "step": 1150
    },
    {
      "epoch": 2.982005141388175,
      "grad_norm": 0.5925987958908081,
      "learning_rate": 5e-06,
      "loss": 0.4908,
      "step": 1160
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.5453311800956726,
      "eval_runtime": 35.1038,
      "eval_samples_per_second": 298.6,
      "eval_steps_per_second": 1.168,
      "step": 1167
    },
    {
      "epoch": 3.0,
      "step": 1167,
      "total_flos": 5.5102128176360325e+19,
      "train_loss": 0.5515584751593321,
      "train_runtime": 8384.6233,
      "train_samples_per_second": 71.254,
      "train_steps_per_second": 0.139
    }
  ],
  "logging_steps": 10,
  "max_steps": 1167,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.5102128176360325e+19,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}