|
{ |
|
"best_metric": 5.700723648071289, |
|
"best_model_checkpoint": "./results/models/checkpoint-101465", |
|
"epoch": 13.0, |
|
"eval_steps": 500, |
|
"global_step": 101465, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.001997437540038437, |
|
"loss": 5.9174, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0019948750800768736, |
|
"loss": 5.8554, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0019923126201153107, |
|
"loss": 5.8681, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 0.0019897501601537477, |
|
"loss": 5.8742, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 0.0019871877001921847, |
|
"loss": 5.8751, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 0.0019846252402306217, |
|
"loss": 5.8581, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 0.0019820627802690583, |
|
"loss": 5.8585, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 0.0019795003203074953, |
|
"loss": 5.851, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 0.001976937860345932, |
|
"loss": 5.8471, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 0.001974375400384369, |
|
"loss": 5.8427, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 0.001971812940422806, |
|
"loss": 5.8416, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 0.001969250480461243, |
|
"loss": 5.8367, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 0.00196668802049968, |
|
"loss": 5.839, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 0.0019641255605381165, |
|
"loss": 5.8303, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"learning_rate": 0.0019615631005765535, |
|
"loss": 5.8259, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 5.826081275939941, |
|
"eval_runtime": 2.7402, |
|
"eval_samples_per_second": 364.938, |
|
"eval_steps_per_second": 2.92, |
|
"step": 7805 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"learning_rate": 0.0019590006406149905, |
|
"loss": 5.8257, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"learning_rate": 0.001956438180653427, |
|
"loss": 5.8125, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"learning_rate": 0.001953875720691864, |
|
"loss": 5.8103, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"learning_rate": 0.001951313260730301, |
|
"loss": 5.8115, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"learning_rate": 0.001948750800768738, |
|
"loss": 5.8042, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"learning_rate": 0.001946188340807175, |
|
"loss": 5.8003, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"learning_rate": 0.0019436258808456118, |
|
"loss": 5.7984, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"learning_rate": 0.0019410634208840488, |
|
"loss": 5.7964, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"learning_rate": 0.0019385009609224858, |
|
"loss": 5.7938, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"learning_rate": 0.0019359385009609226, |
|
"loss": 5.789, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"learning_rate": 0.0019333760409993594, |
|
"loss": 5.785, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"learning_rate": 0.0019308135810377962, |
|
"loss": 5.7901, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"learning_rate": 0.0019282511210762332, |
|
"loss": 5.7844, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"learning_rate": 0.0019256886611146702, |
|
"loss": 5.7802, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"learning_rate": 0.001923126201153107, |
|
"loss": 5.7819, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"learning_rate": 0.001920563741191544, |
|
"loss": 5.7801, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 5.780016899108887, |
|
"eval_runtime": 2.7486, |
|
"eval_samples_per_second": 363.821, |
|
"eval_steps_per_second": 2.911, |
|
"step": 15610 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"learning_rate": 0.0019180012812299808, |
|
"loss": 5.7782, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"learning_rate": 0.0019154388212684176, |
|
"loss": 5.7768, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"learning_rate": 0.0019128763613068546, |
|
"loss": 5.7762, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"learning_rate": 0.0019103139013452914, |
|
"loss": 5.7759, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"learning_rate": 0.0019077514413837285, |
|
"loss": 5.7745, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"learning_rate": 0.0019051889814221653, |
|
"loss": 5.7711, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"learning_rate": 0.0019026265214606023, |
|
"loss": 5.7691, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"learning_rate": 0.0019000640614990393, |
|
"loss": 5.7619, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"learning_rate": 0.0018975016015374759, |
|
"loss": 5.7638, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"learning_rate": 0.0018949391415759129, |
|
"loss": 5.7628, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"learning_rate": 0.0018923766816143497, |
|
"loss": 5.7595, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"learning_rate": 0.0018898142216527867, |
|
"loss": 5.762, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"learning_rate": 0.0018872517616912237, |
|
"loss": 5.7616, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"learning_rate": 0.0018846893017296605, |
|
"loss": 5.769, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"learning_rate": 0.0018821268417680975, |
|
"loss": 5.7605, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 5.762243270874023, |
|
"eval_runtime": 2.7574, |
|
"eval_samples_per_second": 362.657, |
|
"eval_steps_per_second": 2.901, |
|
"step": 23415 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"learning_rate": 0.0018795643818065343, |
|
"loss": 5.7585, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"learning_rate": 0.0018770019218449711, |
|
"loss": 5.7596, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"learning_rate": 0.0018744394618834081, |
|
"loss": 5.7545, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"learning_rate": 0.001871877001921845, |
|
"loss": 5.7538, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"learning_rate": 0.001869314541960282, |
|
"loss": 5.7587, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"learning_rate": 0.0018667520819987187, |
|
"loss": 5.7575, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"learning_rate": 0.0018641896220371558, |
|
"loss": 5.7585, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"learning_rate": 0.0018616271620755928, |
|
"loss": 5.7565, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"learning_rate": 0.0018590647021140294, |
|
"loss": 5.7532, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"learning_rate": 0.0018565022421524664, |
|
"loss": 5.7532, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"learning_rate": 0.0018539397821909032, |
|
"loss": 5.7537, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"learning_rate": 0.0018513773222293402, |
|
"loss": 5.753, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"learning_rate": 0.0018488148622677772, |
|
"loss": 5.7499, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"learning_rate": 0.001846252402306214, |
|
"loss": 5.748, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"learning_rate": 0.001843689942344651, |
|
"loss": 5.7494, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"learning_rate": 0.0018411274823830876, |
|
"loss": 5.7497, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 5.748880863189697, |
|
"eval_runtime": 2.7417, |
|
"eval_samples_per_second": 364.734, |
|
"eval_steps_per_second": 2.918, |
|
"step": 31220 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"learning_rate": 0.0018385650224215246, |
|
"loss": 5.746, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"learning_rate": 0.0018360025624599616, |
|
"loss": 5.744, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"learning_rate": 0.0018334401024983984, |
|
"loss": 5.7402, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 4.23, |
|
"learning_rate": 0.0018308776425368354, |
|
"loss": 5.7429, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"learning_rate": 0.0018283151825752725, |
|
"loss": 5.7466, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"learning_rate": 0.0018257527226137093, |
|
"loss": 5.7444, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"learning_rate": 0.0018231902626521463, |
|
"loss": 5.744, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"learning_rate": 0.0018206278026905828, |
|
"loss": 5.7411, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"learning_rate": 0.0018180653427290199, |
|
"loss": 5.741, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 4.61, |
|
"learning_rate": 0.0018155028827674569, |
|
"loss": 5.7425, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"learning_rate": 0.0018129404228058937, |
|
"loss": 5.7388, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"learning_rate": 0.0018103779628443307, |
|
"loss": 5.7413, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"learning_rate": 0.0018078155028827675, |
|
"loss": 5.7428, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"learning_rate": 0.0018052530429212045, |
|
"loss": 5.7419, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 4.93, |
|
"learning_rate": 0.0018026905829596413, |
|
"loss": 5.7407, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"learning_rate": 0.001800128122998078, |
|
"loss": 5.7419, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 5.745668411254883, |
|
"eval_runtime": 2.7372, |
|
"eval_samples_per_second": 365.338, |
|
"eval_steps_per_second": 2.923, |
|
"step": 39025 |
|
}, |
|
{ |
|
"epoch": 5.06, |
|
"learning_rate": 0.0017975656630365151, |
|
"loss": 5.7408, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 5.12, |
|
"learning_rate": 0.001795003203074952, |
|
"loss": 5.738, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 5.19, |
|
"learning_rate": 0.001792440743113389, |
|
"loss": 5.7374, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"learning_rate": 0.001789878283151826, |
|
"loss": 5.7356, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"learning_rate": 0.0017873158231902627, |
|
"loss": 5.7342, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"learning_rate": 0.0017847533632286995, |
|
"loss": 5.7325, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 5.45, |
|
"learning_rate": 0.0017821909032671363, |
|
"loss": 5.733, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 5.51, |
|
"learning_rate": 0.0017796284433055734, |
|
"loss": 5.7308, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 5.57, |
|
"learning_rate": 0.0017770659833440104, |
|
"loss": 5.7293, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"learning_rate": 0.0017745035233824472, |
|
"loss": 5.7285, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"learning_rate": 0.0017719410634208842, |
|
"loss": 5.7295, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 5.77, |
|
"learning_rate": 0.001769378603459321, |
|
"loss": 5.7277, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 5.83, |
|
"learning_rate": 0.0017668161434977578, |
|
"loss": 5.729, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 5.89, |
|
"learning_rate": 0.0017642536835361948, |
|
"loss": 5.7304, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 5.96, |
|
"learning_rate": 0.0017616912235746316, |
|
"loss": 5.7266, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 5.728877067565918, |
|
"eval_runtime": 2.7253, |
|
"eval_samples_per_second": 366.932, |
|
"eval_steps_per_second": 2.935, |
|
"step": 46830 |
|
}, |
|
{ |
|
"epoch": 6.02, |
|
"learning_rate": 0.0017591287636130686, |
|
"loss": 5.726, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 6.09, |
|
"learning_rate": 0.0017565663036515054, |
|
"loss": 5.7246, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 6.15, |
|
"learning_rate": 0.0017540038436899424, |
|
"loss": 5.7244, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 6.21, |
|
"learning_rate": 0.0017514413837283794, |
|
"loss": 5.7297, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 6.28, |
|
"learning_rate": 0.001748878923766816, |
|
"loss": 5.7275, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 6.34, |
|
"learning_rate": 0.001746316463805253, |
|
"loss": 5.7255, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 6.41, |
|
"learning_rate": 0.0017437540038436898, |
|
"loss": 5.7255, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 6.47, |
|
"learning_rate": 0.0017411915438821268, |
|
"loss": 5.7233, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 6.53, |
|
"learning_rate": 0.0017386290839205639, |
|
"loss": 5.7238, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 6.6, |
|
"learning_rate": 0.0017360666239590007, |
|
"loss": 5.7246, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 6.66, |
|
"learning_rate": 0.0017335041639974377, |
|
"loss": 5.7229, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 6.73, |
|
"learning_rate": 0.0017309417040358745, |
|
"loss": 5.7221, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 6.79, |
|
"learning_rate": 0.0017283792440743113, |
|
"loss": 5.7184, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 6.85, |
|
"learning_rate": 0.0017258167841127483, |
|
"loss": 5.7186, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 6.92, |
|
"learning_rate": 0.001723254324151185, |
|
"loss": 5.7164, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 6.98, |
|
"learning_rate": 0.001720691864189622, |
|
"loss": 5.7184, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 5.722599029541016, |
|
"eval_runtime": 2.7479, |
|
"eval_samples_per_second": 363.909, |
|
"eval_steps_per_second": 2.911, |
|
"step": 54635 |
|
}, |
|
{ |
|
"epoch": 7.05, |
|
"learning_rate": 0.0017181294042280591, |
|
"loss": 5.7188, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 7.11, |
|
"learning_rate": 0.001715566944266496, |
|
"loss": 5.7174, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 7.17, |
|
"learning_rate": 0.001713004484304933, |
|
"loss": 5.719, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 7.24, |
|
"learning_rate": 0.0017104420243433695, |
|
"loss": 5.7205, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 7.3, |
|
"learning_rate": 0.0017078795643818065, |
|
"loss": 5.7193, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"learning_rate": 0.0017053171044202435, |
|
"loss": 5.7191, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 7.43, |
|
"learning_rate": 0.0017027546444586803, |
|
"loss": 5.72, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"learning_rate": 0.0017001921844971173, |
|
"loss": 5.7217, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 7.56, |
|
"learning_rate": 0.0016976297245355541, |
|
"loss": 5.7185, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 7.62, |
|
"learning_rate": 0.0016950672645739912, |
|
"loss": 5.7162, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 7.69, |
|
"learning_rate": 0.001692504804612428, |
|
"loss": 5.7192, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"learning_rate": 0.0016899423446508648, |
|
"loss": 5.7198, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 7.82, |
|
"learning_rate": 0.0016873798846893018, |
|
"loss": 5.7178, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 7.88, |
|
"learning_rate": 0.0016848174247277386, |
|
"loss": 5.7165, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 7.94, |
|
"learning_rate": 0.0016822549647661756, |
|
"loss": 5.7171, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 5.719506740570068, |
|
"eval_runtime": 2.7211, |
|
"eval_samples_per_second": 367.5, |
|
"eval_steps_per_second": 2.94, |
|
"step": 62440 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"learning_rate": 0.0016796925048046126, |
|
"loss": 5.7152, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 8.07, |
|
"learning_rate": 0.0016771300448430494, |
|
"loss": 5.7136, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 8.14, |
|
"learning_rate": 0.0016745675848814864, |
|
"loss": 5.7135, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 8.2, |
|
"learning_rate": 0.001672005124919923, |
|
"loss": 5.7165, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 8.26, |
|
"learning_rate": 0.00166944266495836, |
|
"loss": 5.7167, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 8.33, |
|
"learning_rate": 0.001666880204996797, |
|
"loss": 5.7127, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 8.39, |
|
"learning_rate": 0.0016643177450352338, |
|
"loss": 5.7157, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 8.46, |
|
"learning_rate": 0.0016617552850736708, |
|
"loss": 5.7133, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 8.52, |
|
"learning_rate": 0.0016591928251121076, |
|
"loss": 5.7132, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 8.58, |
|
"learning_rate": 0.0016566303651505446, |
|
"loss": 5.713, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 8.65, |
|
"learning_rate": 0.0016540679051889814, |
|
"loss": 5.7163, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 8.71, |
|
"learning_rate": 0.0016515054452274182, |
|
"loss": 5.7174, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 8.78, |
|
"learning_rate": 0.0016489429852658553, |
|
"loss": 5.7185, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 8.84, |
|
"learning_rate": 0.001646380525304292, |
|
"loss": 5.7133, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 8.9, |
|
"learning_rate": 0.001643818065342729, |
|
"loss": 5.7148, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 8.97, |
|
"learning_rate": 0.001641255605381166, |
|
"loss": 5.7123, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 5.7155256271362305, |
|
"eval_runtime": 2.7307, |
|
"eval_samples_per_second": 366.201, |
|
"eval_steps_per_second": 2.93, |
|
"step": 70245 |
|
}, |
|
{ |
|
"epoch": 9.03, |
|
"learning_rate": 0.0016386931454196029, |
|
"loss": 5.7097, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 9.1, |
|
"learning_rate": 0.0016361306854580397, |
|
"loss": 5.7112, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 9.16, |
|
"learning_rate": 0.0016335682254964765, |
|
"loss": 5.7103, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 9.22, |
|
"learning_rate": 0.0016310057655349135, |
|
"loss": 5.7085, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 9.29, |
|
"learning_rate": 0.0016284433055733505, |
|
"loss": 5.7077, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 9.35, |
|
"learning_rate": 0.0016258808456117873, |
|
"loss": 5.7101, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 9.42, |
|
"learning_rate": 0.0016233183856502243, |
|
"loss": 5.7085, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 9.48, |
|
"learning_rate": 0.0016207559256886613, |
|
"loss": 5.7091, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 9.55, |
|
"learning_rate": 0.001618193465727098, |
|
"loss": 5.7098, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 9.61, |
|
"learning_rate": 0.001615631005765535, |
|
"loss": 5.7136, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 9.67, |
|
"learning_rate": 0.0016130685458039717, |
|
"loss": 5.7154, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 9.74, |
|
"learning_rate": 0.0016105060858424087, |
|
"loss": 5.7117, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 9.8, |
|
"learning_rate": 0.0016079436258808458, |
|
"loss": 5.713, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 9.87, |
|
"learning_rate": 0.0016053811659192826, |
|
"loss": 5.7138, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 9.93, |
|
"learning_rate": 0.0016028187059577196, |
|
"loss": 5.7177, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 9.99, |
|
"learning_rate": 0.0016002562459961564, |
|
"loss": 5.7123, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 5.717950344085693, |
|
"eval_runtime": 2.833, |
|
"eval_samples_per_second": 352.98, |
|
"eval_steps_per_second": 2.824, |
|
"step": 78050 |
|
}, |
|
{ |
|
"epoch": 10.06, |
|
"learning_rate": 0.0015976937860345932, |
|
"loss": 5.7121, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 10.12, |
|
"learning_rate": 0.0015951313260730302, |
|
"loss": 5.7098, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 10.19, |
|
"learning_rate": 0.001592568866111467, |
|
"loss": 5.7113, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 10.25, |
|
"learning_rate": 0.001590006406149904, |
|
"loss": 5.7108, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 10.31, |
|
"learning_rate": 0.0015874439461883408, |
|
"loss": 5.7136, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 10.38, |
|
"learning_rate": 0.0015848814862267778, |
|
"loss": 5.711, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 10.44, |
|
"learning_rate": 0.0015823190262652148, |
|
"loss": 5.7088, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 10.51, |
|
"learning_rate": 0.0015797565663036514, |
|
"loss": 5.7096, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 10.57, |
|
"learning_rate": 0.0015771941063420884, |
|
"loss": 5.7069, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 10.63, |
|
"learning_rate": 0.0015746316463805252, |
|
"loss": 5.7056, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 10.7, |
|
"learning_rate": 0.0015720691864189622, |
|
"loss": 5.7049, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 10.76, |
|
"learning_rate": 0.0015695067264573993, |
|
"loss": 5.7059, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 10.83, |
|
"learning_rate": 0.001566944266495836, |
|
"loss": 5.7057, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 10.89, |
|
"learning_rate": 0.001564381806534273, |
|
"loss": 5.7048, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 10.95, |
|
"learning_rate": 0.0015618193465727096, |
|
"loss": 5.7071, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 5.710795879364014, |
|
"eval_runtime": 2.7348, |
|
"eval_samples_per_second": 365.653, |
|
"eval_steps_per_second": 2.925, |
|
"step": 85855 |
|
}, |
|
{ |
|
"epoch": 11.02, |
|
"learning_rate": 0.0015592568866111467, |
|
"loss": 5.7076, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 11.08, |
|
"learning_rate": 0.0015566944266495837, |
|
"loss": 5.7062, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 11.15, |
|
"learning_rate": 0.0015541319666880205, |
|
"loss": 5.7041, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 11.21, |
|
"learning_rate": 0.0015515695067264575, |
|
"loss": 5.7024, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 11.27, |
|
"learning_rate": 0.0015490070467648943, |
|
"loss": 5.7024, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 11.34, |
|
"learning_rate": 0.0015464445868033313, |
|
"loss": 5.7031, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 11.4, |
|
"learning_rate": 0.001543882126841768, |
|
"loss": 5.7031, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 11.47, |
|
"learning_rate": 0.001541319666880205, |
|
"loss": 5.7028, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 11.53, |
|
"learning_rate": 0.001538757206918642, |
|
"loss": 5.6996, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 11.6, |
|
"learning_rate": 0.0015361947469570787, |
|
"loss": 5.7012, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 11.66, |
|
"learning_rate": 0.0015336322869955157, |
|
"loss": 5.7018, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 11.72, |
|
"learning_rate": 0.0015310698270339527, |
|
"loss": 5.7001, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 11.79, |
|
"learning_rate": 0.0015285073670723895, |
|
"loss": 5.7005, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 11.85, |
|
"learning_rate": 0.0015259449071108266, |
|
"loss": 5.7001, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 11.92, |
|
"learning_rate": 0.0015233824471492631, |
|
"loss": 5.6995, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 11.98, |
|
"learning_rate": 0.0015208199871877002, |
|
"loss": 5.7, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 5.703713893890381, |
|
"eval_runtime": 2.754, |
|
"eval_samples_per_second": 363.115, |
|
"eval_steps_per_second": 2.905, |
|
"step": 93660 |
|
}, |
|
{ |
|
"epoch": 12.04, |
|
"learning_rate": 0.0015182575272261372, |
|
"loss": 5.7018, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 12.11, |
|
"learning_rate": 0.001515695067264574, |
|
"loss": 5.6988, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 12.17, |
|
"learning_rate": 0.001513132607303011, |
|
"loss": 5.6973, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 12.24, |
|
"learning_rate": 0.001510570147341448, |
|
"loss": 5.6993, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 12.3, |
|
"learning_rate": 0.0015080076873798848, |
|
"loss": 5.6985, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 12.36, |
|
"learning_rate": 0.0015054452274183216, |
|
"loss": 5.6973, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 12.43, |
|
"learning_rate": 0.0015028827674567584, |
|
"loss": 5.6965, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 12.49, |
|
"learning_rate": 0.0015003203074951954, |
|
"loss": 5.6981, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 12.56, |
|
"learning_rate": 0.0014977578475336324, |
|
"loss": 5.6958, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 12.62, |
|
"learning_rate": 0.0014951953875720692, |
|
"loss": 5.6971, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 12.68, |
|
"learning_rate": 0.0014926329276105062, |
|
"loss": 5.697, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 12.75, |
|
"learning_rate": 0.001490070467648943, |
|
"loss": 5.6977, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 12.81, |
|
"learning_rate": 0.0014875080076873798, |
|
"loss": 5.699, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 12.88, |
|
"learning_rate": 0.0014849455477258168, |
|
"loss": 5.6974, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 12.94, |
|
"learning_rate": 0.0014823830877642536, |
|
"loss": 5.6984, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_loss": 5.700723648071289, |
|
"eval_runtime": 2.8976, |
|
"eval_samples_per_second": 345.111, |
|
"eval_steps_per_second": 2.761, |
|
"step": 101465 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 390250, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"total_flos": 2.652474175782912e+18, |
|
"train_batch_size": 128, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|