{ "best_metric": 64.42910915934755, "best_model_checkpoint": "/scratch/camembertv2/runs/results/fquad/camembertav2-base-bf16-p2-17000/max_seq_length-896-doc_stride-128-max_answer_length-30-gradient_accumulation_steps-2-precision-fp32-learning_rate-3e-05-epochs-6-lr_scheduler-cosine-warmup_steps-0/SEED-1/checkpoint-6480", "epoch": 6.0, "eval_steps": 500, "global_step": 7776, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07716049382716049, "grad_norm": 23.665807723999023, "learning_rate": 2.998775977415799e-05, "loss": 3.705, "step": 100 }, { "epoch": 0.15432098765432098, "grad_norm": 20.01573944091797, "learning_rate": 2.9951059073049117e-05, "loss": 1.6565, "step": 200 }, { "epoch": 0.23148148148148148, "grad_norm": 12.386185646057129, "learning_rate": 2.988995779332273e-05, "loss": 1.4648, "step": 300 }, { "epoch": 0.30864197530864196, "grad_norm": 18.621143341064453, "learning_rate": 2.980455565410724e-05, "loss": 1.243, "step": 400 }, { "epoch": 0.38580246913580246, "grad_norm": 15.412405967712402, "learning_rate": 2.96949920342655e-05, "loss": 1.1382, "step": 500 }, { "epoch": 0.46296296296296297, "grad_norm": 18.231019973754883, "learning_rate": 2.95614457449243e-05, "loss": 1.1534, "step": 600 }, { "epoch": 0.5401234567901234, "grad_norm": 16.341306686401367, "learning_rate": 2.940413473764923e-05, "loss": 1.0392, "step": 700 }, { "epoch": 0.6172839506172839, "grad_norm": 12.782474517822266, "learning_rate": 2.9223315748741146e-05, "loss": 1.0394, "step": 800 }, { "epoch": 0.6944444444444444, "grad_norm": 15.871848106384277, "learning_rate": 2.9019283880234828e-05, "loss": 0.9878, "step": 900 }, { "epoch": 0.7716049382716049, "grad_norm": 8.874638557434082, "learning_rate": 2.879237211828353e-05, "loss": 1.0025, "step": 1000 }, { "epoch": 0.8487654320987654, "grad_norm": 19.52781867980957, "learning_rate": 2.8542950789715587e-05, "loss": 0.9396, "step": 1100 }, { "epoch": 0.9259259259259259, "grad_norm": 11.584712982177734, "learning_rate": 2.8271426957649866e-05, "loss": 0.9625, "step": 1200 }, { "epoch": 1.0, "eval_exact_match": 61.91969887076537, "eval_f1": 81.01142429829103, "eval_runtime": 46.1272, "eval_samples_per_second": 69.113, "eval_steps_per_second": 1.084, "step": 1296 }, { "epoch": 1.0030864197530864, "grad_norm": 7.031412601470947, "learning_rate": 2.7978243757156497e-05, "loss": 0.9411, "step": 1300 }, { "epoch": 1.0802469135802468, "grad_norm": 11.493968963623047, "learning_rate": 2.7663879672047095e-05, "loss": 0.6781, "step": 1400 }, { "epoch": 1.1574074074074074, "grad_norm": 5.261780738830566, "learning_rate": 2.732884775397477e-05, "loss": 0.6742, "step": 1500 }, { "epoch": 1.2345679012345678, "grad_norm": 13.310417175292969, "learning_rate": 2.6973694785118392e-05, "loss": 0.7004, "step": 1600 }, { "epoch": 1.3117283950617284, "grad_norm": 17.390798568725586, "learning_rate": 2.65990003858176e-05, "loss": 0.6512, "step": 1700 }, { "epoch": 1.3888888888888888, "grad_norm": 14.149763107299805, "learning_rate": 2.620537606861494e-05, "loss": 0.7074, "step": 1800 }, { "epoch": 1.4660493827160495, "grad_norm": 5.80026912689209, "learning_rate": 2.5793464240249014e-05, "loss": 0.6629, "step": 1900 }, { "epoch": 1.5432098765432098, "grad_norm": 17.484006881713867, "learning_rate": 2.536393715322732e-05, "loss": 0.6993, "step": 2000 }, { "epoch": 1.6203703703703702, "grad_norm": 21.80438804626465, "learning_rate": 2.49174958086899e-05, "loss": 0.6408, "step": 2100 }, { "epoch": 1.6975308641975309, "grad_norm": 19.416994094848633, "learning_rate": 2.4454868812354406e-05, "loss": 0.6592, "step": 2200 }, { "epoch": 1.7746913580246915, "grad_norm": 5.628990650177002, "learning_rate": 2.3976811185409607e-05, "loss": 0.622, "step": 2300 }, { "epoch": 1.8518518518518519, "grad_norm": 14.69774341583252, "learning_rate": 2.3484103132298082e-05, "loss": 0.645, "step": 2400 }, { "epoch": 1.9290123456790123, "grad_norm": 13.895576477050781, "learning_rate": 2.297754876739905e-05, "loss": 0.6746, "step": 2500 }, { "epoch": 2.0, "eval_exact_match": 63.26850690087829, "eval_f1": 82.69346269167056, "eval_runtime": 45.8773, "eval_samples_per_second": 69.49, "eval_steps_per_second": 1.09, "step": 2592 }, { "epoch": 2.006172839506173, "grad_norm": 7.386751651763916, "learning_rate": 2.2457974802689542e-05, "loss": 0.6472, "step": 2600 }, { "epoch": 2.0833333333333335, "grad_norm": 12.677722930908203, "learning_rate": 2.192622919852551e-05, "loss": 0.4365, "step": 2700 }, { "epoch": 2.1604938271604937, "grad_norm": 7.171934127807617, "learning_rate": 2.138317977974501e-05, "loss": 0.4287, "step": 2800 }, { "epoch": 2.2376543209876543, "grad_norm": 10.016780853271484, "learning_rate": 2.082971281935195e-05, "loss": 0.4462, "step": 2900 }, { "epoch": 2.314814814814815, "grad_norm": 21.301910400390625, "learning_rate": 2.0266731592091834e-05, "loss": 0.4425, "step": 3000 }, { "epoch": 2.3919753086419755, "grad_norm": 21.78326988220215, "learning_rate": 1.969515490028019e-05, "loss": 0.425, "step": 3100 }, { "epoch": 2.4691358024691357, "grad_norm": 17.772539138793945, "learning_rate": 1.9115915574289523e-05, "loss": 0.4181, "step": 3200 }, { "epoch": 2.5462962962962963, "grad_norm": 7.547439098358154, "learning_rate": 1.8529958950142064e-05, "loss": 0.4233, "step": 3300 }, { "epoch": 2.623456790123457, "grad_norm": 9.031538963317871, "learning_rate": 1.7938241326692906e-05, "loss": 0.4691, "step": 3400 }, { "epoch": 2.700617283950617, "grad_norm": 9.722735404968262, "learning_rate": 1.734172840492147e-05, "loss": 0.4498, "step": 3500 }, { "epoch": 2.7777777777777777, "grad_norm": 9.985281944274902, "learning_rate": 1.6741393711878455e-05, "loss": 0.4388, "step": 3600 }, { "epoch": 2.8549382716049383, "grad_norm": 9.514204978942871, "learning_rate": 1.6138217011860335e-05, "loss": 0.4501, "step": 3700 }, { "epoch": 2.932098765432099, "grad_norm": 16.88687515258789, "learning_rate": 1.5533182707404563e-05, "loss": 0.4172, "step": 3800 }, { "epoch": 3.0, "eval_exact_match": 63.676286072772896, "eval_f1": 82.60726439387956, "eval_runtime": 45.9653, "eval_samples_per_second": 69.357, "eval_steps_per_second": 1.088, "step": 3888 }, { "epoch": 3.009259259259259, "grad_norm": 4.132925033569336, "learning_rate": 1.4927278232714974e-05, "loss": 0.3689, "step": 3900 }, { "epoch": 3.0864197530864197, "grad_norm": 9.779620170593262, "learning_rate": 1.4321492442139406e-05, "loss": 0.2905, "step": 4000 }, { "epoch": 3.1635802469135803, "grad_norm": 7.350837230682373, "learning_rate": 1.371681399632967e-05, "loss": 0.2937, "step": 4100 }, { "epoch": 3.240740740740741, "grad_norm": 6.923620223999023, "learning_rate": 1.3114229748717562e-05, "loss": 0.2922, "step": 4200 }, { "epoch": 3.317901234567901, "grad_norm": 16.84642791748047, "learning_rate": 1.2514723134940363e-05, "loss": 0.28, "step": 4300 }, { "epoch": 3.3950617283950617, "grad_norm": 22.180021286010742, "learning_rate": 1.191927256784427e-05, "loss": 0.2907, "step": 4400 }, { "epoch": 3.4722222222222223, "grad_norm": 2.5661354064941406, "learning_rate": 1.1328849840685143e-05, "loss": 0.2806, "step": 4500 }, { "epoch": 3.549382716049383, "grad_norm": 11.584675788879395, "learning_rate": 1.0744418541132676e-05, "loss": 0.2963, "step": 4600 }, { "epoch": 3.626543209876543, "grad_norm": 5.476423740386963, "learning_rate": 1.0166932478666293e-05, "loss": 0.3199, "step": 4700 }, { "epoch": 3.7037037037037037, "grad_norm": 9.405366897583008, "learning_rate": 9.597334127929346e-06, "loss": 0.3107, "step": 4800 }, { "epoch": 3.7808641975308643, "grad_norm": 8.880900382995605, "learning_rate": 9.036553090582144e-06, "loss": 0.2991, "step": 4900 }, { "epoch": 3.8580246913580245, "grad_norm": 3.8892629146575928, "learning_rate": 8.485504578164017e-06, "loss": 0.2716, "step": 5000 }, { "epoch": 3.935185185185185, "grad_norm": 5.704967498779297, "learning_rate": 7.945087918440563e-06, "loss": 0.2688, "step": 5100 }, { "epoch": 4.0, "eval_exact_match": 64.2409033877039, "eval_f1": 83.135484930466, "eval_runtime": 45.9298, "eval_samples_per_second": 69.41, "eval_steps_per_second": 1.089, "step": 5184 }, { "epoch": 4.012345679012346, "grad_norm": 16.147579193115234, "learning_rate": 7.416185087673616e-06, "loss": 0.2919, "step": 5200 }, { "epoch": 4.089506172839506, "grad_norm": 13.380005836486816, "learning_rate": 6.899659271209459e-06, "loss": 0.2068, "step": 5300 }, { "epoch": 4.166666666666667, "grad_norm": 9.491084098815918, "learning_rate": 6.3963534547343126e-06, "loss": 0.2009, "step": 5400 }, { "epoch": 4.243827160493828, "grad_norm": 14.11040210723877, "learning_rate": 5.907089048496351e-06, "loss": 0.2124, "step": 5500 }, { "epoch": 4.320987654320987, "grad_norm": 12.674304962158203, "learning_rate": 5.4326645467394085e-06, "loss": 0.2173, "step": 5600 }, { "epoch": 4.398148148148148, "grad_norm": 5.682621955871582, "learning_rate": 4.973854224536363e-06, "loss": 0.213, "step": 5700 }, { "epoch": 4.4753086419753085, "grad_norm": 5.133475303649902, "learning_rate": 4.5314068741488615e-06, "loss": 0.2, "step": 5800 }, { "epoch": 4.552469135802469, "grad_norm": 6.370384693145752, "learning_rate": 4.1060445829758305e-06, "loss": 0.197, "step": 5900 }, { "epoch": 4.62962962962963, "grad_norm": 16.37765884399414, "learning_rate": 3.6984615550850894e-06, "loss": 0.2051, "step": 6000 }, { "epoch": 4.70679012345679, "grad_norm": 11.54761791229248, "learning_rate": 3.3093229782514023e-06, "loss": 0.1733, "step": 6100 }, { "epoch": 4.783950617283951, "grad_norm": 22.175281524658203, "learning_rate": 2.939263938350012e-06, "loss": 0.2003, "step": 6200 }, { "epoch": 4.861111111111111, "grad_norm": 1.2753137350082397, "learning_rate": 2.588888382877342e-06, "loss": 0.194, "step": 6300 }, { "epoch": 4.938271604938271, "grad_norm": 32.319236755371094, "learning_rate": 2.2587681352905404e-06, "loss": 0.2149, "step": 6400 }, { "epoch": 5.0, "eval_exact_match": 64.42910915934755, "eval_f1": 83.36016013340664, "eval_runtime": 45.8927, "eval_samples_per_second": 69.466, "eval_steps_per_second": 1.089, "step": 6480 }, { "epoch": 5.015432098765432, "grad_norm": 9.053484916687012, "learning_rate": 1.9494419617743312e-06, "loss": 0.198, "step": 6500 }, { "epoch": 5.092592592592593, "grad_norm": 11.60733699798584, "learning_rate": 1.6614146919584094e-06, "loss": 0.1512, "step": 6600 }, { "epoch": 5.169753086419753, "grad_norm": 9.327279090881348, "learning_rate": 1.3951563950202656e-06, "loss": 0.167, "step": 6700 }, { "epoch": 5.246913580246914, "grad_norm": 4.551391124725342, "learning_rate": 1.1511016125181445e-06, "loss": 0.1315, "step": 6800 }, { "epoch": 5.324074074074074, "grad_norm": 4.2411274909973145, "learning_rate": 9.296486492061334e-07, "loss": 0.1532, "step": 6900 }, { "epoch": 5.401234567901234, "grad_norm": 5.1193461418151855, "learning_rate": 7.311589229888083e-07, "loss": 0.1624, "step": 7000 }, { "epoch": 5.478395061728395, "grad_norm": 1.735378384590149, "learning_rate": 5.55956375076332e-07, "loss": 0.1688, "step": 7100 }, { "epoch": 5.555555555555555, "grad_norm": 3.7359869480133057, "learning_rate": 4.043269413026429e-07, "loss": 0.148, "step": 7200 }, { "epoch": 5.632716049382716, "grad_norm": 6.915912628173828, "learning_rate": 2.7651808546956646e-07, "loss": 0.1822, "step": 7300 }, { "epoch": 5.709876543209877, "grad_norm": 5.2128448486328125, "learning_rate": 1.727383954784373e-07, "loss": 0.163, "step": 7400 }, { "epoch": 5.787037037037037, "grad_norm": 3.4656639099121094, "learning_rate": 9.315724290836047e-08, "loss": 0.1716, "step": 7500 }, { "epoch": 5.864197530864198, "grad_norm": 21.396251678466797, "learning_rate": 3.790450659670097e-08, "loss": 0.1694, "step": 7600 }, { "epoch": 5.9413580246913575, "grad_norm": 14.200843811035156, "learning_rate": 7.070360672907228e-09, "loss": 0.1618, "step": 7700 }, { "epoch": 6.0, "eval_exact_match": 64.0840652446675, "eval_f1": 83.12314115625247, "eval_runtime": 45.7565, "eval_samples_per_second": 69.673, "eval_steps_per_second": 1.093, "step": 7776 }, { "epoch": 6.0, "step": 7776, "total_flos": 2.0394634246921464e+16, "train_loss": 0.5145930189164087, "train_runtime": 3736.1381, "train_samples_per_second": 33.293, "train_steps_per_second": 2.081 } ], "logging_steps": 100, "max_steps": 7776, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0394634246921464e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }