|
{ |
|
"best_metric": 64.42910915934755, |
|
"best_model_checkpoint": "/scratch/camembertv2/runs/results/fquad/camembertav2-base-bf16-p2-17000/max_seq_length-896-doc_stride-128-max_answer_length-30-gradient_accumulation_steps-2-precision-fp32-learning_rate-3e-05-epochs-6-lr_scheduler-cosine-warmup_steps-0/SEED-1/checkpoint-6480", |
|
"epoch": 6.0, |
|
"eval_steps": 500, |
|
"global_step": 7776, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.07716049382716049, |
|
"grad_norm": 23.665807723999023, |
|
"learning_rate": 2.998775977415799e-05, |
|
"loss": 3.705, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.15432098765432098, |
|
"grad_norm": 20.01573944091797, |
|
"learning_rate": 2.9951059073049117e-05, |
|
"loss": 1.6565, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.23148148148148148, |
|
"grad_norm": 12.386185646057129, |
|
"learning_rate": 2.988995779332273e-05, |
|
"loss": 1.4648, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.30864197530864196, |
|
"grad_norm": 18.621143341064453, |
|
"learning_rate": 2.980455565410724e-05, |
|
"loss": 1.243, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.38580246913580246, |
|
"grad_norm": 15.412405967712402, |
|
"learning_rate": 2.96949920342655e-05, |
|
"loss": 1.1382, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.46296296296296297, |
|
"grad_norm": 18.231019973754883, |
|
"learning_rate": 2.95614457449243e-05, |
|
"loss": 1.1534, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.5401234567901234, |
|
"grad_norm": 16.341306686401367, |
|
"learning_rate": 2.940413473764923e-05, |
|
"loss": 1.0392, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.6172839506172839, |
|
"grad_norm": 12.782474517822266, |
|
"learning_rate": 2.9223315748741146e-05, |
|
"loss": 1.0394, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.6944444444444444, |
|
"grad_norm": 15.871848106384277, |
|
"learning_rate": 2.9019283880234828e-05, |
|
"loss": 0.9878, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.7716049382716049, |
|
"grad_norm": 8.874638557434082, |
|
"learning_rate": 2.879237211828353e-05, |
|
"loss": 1.0025, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.8487654320987654, |
|
"grad_norm": 19.52781867980957, |
|
"learning_rate": 2.8542950789715587e-05, |
|
"loss": 0.9396, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.9259259259259259, |
|
"grad_norm": 11.584712982177734, |
|
"learning_rate": 2.8271426957649866e-05, |
|
"loss": 0.9625, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_exact_match": 61.91969887076537, |
|
"eval_f1": 81.01142429829103, |
|
"eval_runtime": 46.1272, |
|
"eval_samples_per_second": 69.113, |
|
"eval_steps_per_second": 1.084, |
|
"step": 1296 |
|
}, |
|
{ |
|
"epoch": 1.0030864197530864, |
|
"grad_norm": 7.031412601470947, |
|
"learning_rate": 2.7978243757156497e-05, |
|
"loss": 0.9411, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.0802469135802468, |
|
"grad_norm": 11.493968963623047, |
|
"learning_rate": 2.7663879672047095e-05, |
|
"loss": 0.6781, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.1574074074074074, |
|
"grad_norm": 5.261780738830566, |
|
"learning_rate": 2.732884775397477e-05, |
|
"loss": 0.6742, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.2345679012345678, |
|
"grad_norm": 13.310417175292969, |
|
"learning_rate": 2.6973694785118392e-05, |
|
"loss": 0.7004, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.3117283950617284, |
|
"grad_norm": 17.390798568725586, |
|
"learning_rate": 2.65990003858176e-05, |
|
"loss": 0.6512, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.3888888888888888, |
|
"grad_norm": 14.149763107299805, |
|
"learning_rate": 2.620537606861494e-05, |
|
"loss": 0.7074, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.4660493827160495, |
|
"grad_norm": 5.80026912689209, |
|
"learning_rate": 2.5793464240249014e-05, |
|
"loss": 0.6629, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.5432098765432098, |
|
"grad_norm": 17.484006881713867, |
|
"learning_rate": 2.536393715322732e-05, |
|
"loss": 0.6993, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.6203703703703702, |
|
"grad_norm": 21.80438804626465, |
|
"learning_rate": 2.49174958086899e-05, |
|
"loss": 0.6408, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.6975308641975309, |
|
"grad_norm": 19.416994094848633, |
|
"learning_rate": 2.4454868812354406e-05, |
|
"loss": 0.6592, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.7746913580246915, |
|
"grad_norm": 5.628990650177002, |
|
"learning_rate": 2.3976811185409607e-05, |
|
"loss": 0.622, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.8518518518518519, |
|
"grad_norm": 14.69774341583252, |
|
"learning_rate": 2.3484103132298082e-05, |
|
"loss": 0.645, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.9290123456790123, |
|
"grad_norm": 13.895576477050781, |
|
"learning_rate": 2.297754876739905e-05, |
|
"loss": 0.6746, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_exact_match": 63.26850690087829, |
|
"eval_f1": 82.69346269167056, |
|
"eval_runtime": 45.8773, |
|
"eval_samples_per_second": 69.49, |
|
"eval_steps_per_second": 1.09, |
|
"step": 2592 |
|
}, |
|
{ |
|
"epoch": 2.006172839506173, |
|
"grad_norm": 7.386751651763916, |
|
"learning_rate": 2.2457974802689542e-05, |
|
"loss": 0.6472, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.0833333333333335, |
|
"grad_norm": 12.677722930908203, |
|
"learning_rate": 2.192622919852551e-05, |
|
"loss": 0.4365, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.1604938271604937, |
|
"grad_norm": 7.171934127807617, |
|
"learning_rate": 2.138317977974501e-05, |
|
"loss": 0.4287, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.2376543209876543, |
|
"grad_norm": 10.016780853271484, |
|
"learning_rate": 2.082971281935195e-05, |
|
"loss": 0.4462, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.314814814814815, |
|
"grad_norm": 21.301910400390625, |
|
"learning_rate": 2.0266731592091834e-05, |
|
"loss": 0.4425, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.3919753086419755, |
|
"grad_norm": 21.78326988220215, |
|
"learning_rate": 1.969515490028019e-05, |
|
"loss": 0.425, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.4691358024691357, |
|
"grad_norm": 17.772539138793945, |
|
"learning_rate": 1.9115915574289523e-05, |
|
"loss": 0.4181, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.5462962962962963, |
|
"grad_norm": 7.547439098358154, |
|
"learning_rate": 1.8529958950142064e-05, |
|
"loss": 0.4233, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.623456790123457, |
|
"grad_norm": 9.031538963317871, |
|
"learning_rate": 1.7938241326692906e-05, |
|
"loss": 0.4691, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.700617283950617, |
|
"grad_norm": 9.722735404968262, |
|
"learning_rate": 1.734172840492147e-05, |
|
"loss": 0.4498, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.7777777777777777, |
|
"grad_norm": 9.985281944274902, |
|
"learning_rate": 1.6741393711878455e-05, |
|
"loss": 0.4388, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.8549382716049383, |
|
"grad_norm": 9.514204978942871, |
|
"learning_rate": 1.6138217011860335e-05, |
|
"loss": 0.4501, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.932098765432099, |
|
"grad_norm": 16.88687515258789, |
|
"learning_rate": 1.5533182707404563e-05, |
|
"loss": 0.4172, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_exact_match": 63.676286072772896, |
|
"eval_f1": 82.60726439387956, |
|
"eval_runtime": 45.9653, |
|
"eval_samples_per_second": 69.357, |
|
"eval_steps_per_second": 1.088, |
|
"step": 3888 |
|
}, |
|
{ |
|
"epoch": 3.009259259259259, |
|
"grad_norm": 4.132925033569336, |
|
"learning_rate": 1.4927278232714974e-05, |
|
"loss": 0.3689, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.0864197530864197, |
|
"grad_norm": 9.779620170593262, |
|
"learning_rate": 1.4321492442139406e-05, |
|
"loss": 0.2905, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.1635802469135803, |
|
"grad_norm": 7.350837230682373, |
|
"learning_rate": 1.371681399632967e-05, |
|
"loss": 0.2937, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 3.240740740740741, |
|
"grad_norm": 6.923620223999023, |
|
"learning_rate": 1.3114229748717562e-05, |
|
"loss": 0.2922, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 3.317901234567901, |
|
"grad_norm": 16.84642791748047, |
|
"learning_rate": 1.2514723134940363e-05, |
|
"loss": 0.28, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 3.3950617283950617, |
|
"grad_norm": 22.180021286010742, |
|
"learning_rate": 1.191927256784427e-05, |
|
"loss": 0.2907, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 3.4722222222222223, |
|
"grad_norm": 2.5661354064941406, |
|
"learning_rate": 1.1328849840685143e-05, |
|
"loss": 0.2806, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 3.549382716049383, |
|
"grad_norm": 11.584675788879395, |
|
"learning_rate": 1.0744418541132676e-05, |
|
"loss": 0.2963, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 3.626543209876543, |
|
"grad_norm": 5.476423740386963, |
|
"learning_rate": 1.0166932478666293e-05, |
|
"loss": 0.3199, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 3.7037037037037037, |
|
"grad_norm": 9.405366897583008, |
|
"learning_rate": 9.597334127929346e-06, |
|
"loss": 0.3107, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 3.7808641975308643, |
|
"grad_norm": 8.880900382995605, |
|
"learning_rate": 9.036553090582144e-06, |
|
"loss": 0.2991, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 3.8580246913580245, |
|
"grad_norm": 3.8892629146575928, |
|
"learning_rate": 8.485504578164017e-06, |
|
"loss": 0.2716, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 3.935185185185185, |
|
"grad_norm": 5.704967498779297, |
|
"learning_rate": 7.945087918440563e-06, |
|
"loss": 0.2688, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_exact_match": 64.2409033877039, |
|
"eval_f1": 83.135484930466, |
|
"eval_runtime": 45.9298, |
|
"eval_samples_per_second": 69.41, |
|
"eval_steps_per_second": 1.089, |
|
"step": 5184 |
|
}, |
|
{ |
|
"epoch": 4.012345679012346, |
|
"grad_norm": 16.147579193115234, |
|
"learning_rate": 7.416185087673616e-06, |
|
"loss": 0.2919, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 4.089506172839506, |
|
"grad_norm": 13.380005836486816, |
|
"learning_rate": 6.899659271209459e-06, |
|
"loss": 0.2068, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 4.166666666666667, |
|
"grad_norm": 9.491084098815918, |
|
"learning_rate": 6.3963534547343126e-06, |
|
"loss": 0.2009, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 4.243827160493828, |
|
"grad_norm": 14.11040210723877, |
|
"learning_rate": 5.907089048496351e-06, |
|
"loss": 0.2124, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 4.320987654320987, |
|
"grad_norm": 12.674304962158203, |
|
"learning_rate": 5.4326645467394085e-06, |
|
"loss": 0.2173, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 4.398148148148148, |
|
"grad_norm": 5.682621955871582, |
|
"learning_rate": 4.973854224536363e-06, |
|
"loss": 0.213, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 4.4753086419753085, |
|
"grad_norm": 5.133475303649902, |
|
"learning_rate": 4.5314068741488615e-06, |
|
"loss": 0.2, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 4.552469135802469, |
|
"grad_norm": 6.370384693145752, |
|
"learning_rate": 4.1060445829758305e-06, |
|
"loss": 0.197, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 4.62962962962963, |
|
"grad_norm": 16.37765884399414, |
|
"learning_rate": 3.6984615550850894e-06, |
|
"loss": 0.2051, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 4.70679012345679, |
|
"grad_norm": 11.54761791229248, |
|
"learning_rate": 3.3093229782514023e-06, |
|
"loss": 0.1733, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 4.783950617283951, |
|
"grad_norm": 22.175281524658203, |
|
"learning_rate": 2.939263938350012e-06, |
|
"loss": 0.2003, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 4.861111111111111, |
|
"grad_norm": 1.2753137350082397, |
|
"learning_rate": 2.588888382877342e-06, |
|
"loss": 0.194, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 4.938271604938271, |
|
"grad_norm": 32.319236755371094, |
|
"learning_rate": 2.2587681352905404e-06, |
|
"loss": 0.2149, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_exact_match": 64.42910915934755, |
|
"eval_f1": 83.36016013340664, |
|
"eval_runtime": 45.8927, |
|
"eval_samples_per_second": 69.466, |
|
"eval_steps_per_second": 1.089, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 5.015432098765432, |
|
"grad_norm": 9.053484916687012, |
|
"learning_rate": 1.9494419617743312e-06, |
|
"loss": 0.198, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 5.092592592592593, |
|
"grad_norm": 11.60733699798584, |
|
"learning_rate": 1.6614146919584094e-06, |
|
"loss": 0.1512, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 5.169753086419753, |
|
"grad_norm": 9.327279090881348, |
|
"learning_rate": 1.3951563950202656e-06, |
|
"loss": 0.167, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 5.246913580246914, |
|
"grad_norm": 4.551391124725342, |
|
"learning_rate": 1.1511016125181445e-06, |
|
"loss": 0.1315, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 5.324074074074074, |
|
"grad_norm": 4.2411274909973145, |
|
"learning_rate": 9.296486492061334e-07, |
|
"loss": 0.1532, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 5.401234567901234, |
|
"grad_norm": 5.1193461418151855, |
|
"learning_rate": 7.311589229888083e-07, |
|
"loss": 0.1624, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 5.478395061728395, |
|
"grad_norm": 1.735378384590149, |
|
"learning_rate": 5.55956375076332e-07, |
|
"loss": 0.1688, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 5.555555555555555, |
|
"grad_norm": 3.7359869480133057, |
|
"learning_rate": 4.043269413026429e-07, |
|
"loss": 0.148, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 5.632716049382716, |
|
"grad_norm": 6.915912628173828, |
|
"learning_rate": 2.7651808546956646e-07, |
|
"loss": 0.1822, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 5.709876543209877, |
|
"grad_norm": 5.2128448486328125, |
|
"learning_rate": 1.727383954784373e-07, |
|
"loss": 0.163, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 5.787037037037037, |
|
"grad_norm": 3.4656639099121094, |
|
"learning_rate": 9.315724290836047e-08, |
|
"loss": 0.1716, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 5.864197530864198, |
|
"grad_norm": 21.396251678466797, |
|
"learning_rate": 3.790450659670097e-08, |
|
"loss": 0.1694, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 5.9413580246913575, |
|
"grad_norm": 14.200843811035156, |
|
"learning_rate": 7.070360672907228e-09, |
|
"loss": 0.1618, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_exact_match": 64.0840652446675, |
|
"eval_f1": 83.12314115625247, |
|
"eval_runtime": 45.7565, |
|
"eval_samples_per_second": 69.673, |
|
"eval_steps_per_second": 1.093, |
|
"step": 7776 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"step": 7776, |
|
"total_flos": 2.0394634246921464e+16, |
|
"train_loss": 0.5145930189164087, |
|
"train_runtime": 3736.1381, |
|
"train_samples_per_second": 33.293, |
|
"train_steps_per_second": 2.081 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 7776, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 6, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.0394634246921464e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|