{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.99889339727038, "eval_steps": 50, "global_step": 677, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "acc": 0.85936797, "epoch": 0.0014754703061600886, "grad_norm": 7.874454151515785, "learning_rate": 0.0, "loss": 0.68658942, "memory(GiB)": 24.89, "step": 1, "train_speed(iter/s)": 0.03037 }, { "acc": 0.84321463, "epoch": 0.0073773515308004425, "grad_norm": 8.79654818500605, "learning_rate": 7.628557760232497e-07, "loss": 0.79017758, "memory(GiB)": 31.87, "step": 5, "train_speed(iter/s)": 0.092709 }, { "acc": 0.85256624, "epoch": 0.014754703061600885, "grad_norm": 8.005772072681205, "learning_rate": 1.0913998759473501e-06, "loss": 0.70760584, "memory(GiB)": 33.75, "step": 10, "train_speed(iter/s)": 0.120868 }, { "acc": 0.85825052, "epoch": 0.022132054592401328, "grad_norm": 4.861872738410458, "learning_rate": 1.2835858542361333e-06, "loss": 0.64002485, "memory(GiB)": 33.01, "step": 15, "train_speed(iter/s)": 0.137764 }, { "acc": 0.8677763, "epoch": 0.02950940612320177, "grad_norm": 2.624090927434735, "learning_rate": 1.4199439758714505e-06, "loss": 0.5428031, "memory(GiB)": 34.84, "step": 20, "train_speed(iter/s)": 0.148523 }, { "acc": 0.88262272, "epoch": 0.03688675765400221, "grad_norm": 2.2979293864903276, "learning_rate": 1.5257115520464994e-06, "loss": 0.45293074, "memory(GiB)": 31.42, "step": 25, "train_speed(iter/s)": 0.152816 }, { "acc": 0.88684368, "epoch": 0.044264109184802655, "grad_norm": 2.321279166108657, "learning_rate": 1.6121299541602339e-06, "loss": 0.44487882, "memory(GiB)": 34.17, "step": 30, "train_speed(iter/s)": 0.158226 }, { "acc": 0.88785019, "epoch": 0.0516414607156031, "grad_norm": 1.6462078924259171, "learning_rate": 1.6851956720581583e-06, "loss": 0.42431307, "memory(GiB)": 33.89, "step": 35, "train_speed(iter/s)": 0.160915 }, { "acc": 0.88771706, "epoch": 0.05901881224640354, "grad_norm": 2.0535907435541323, "learning_rate": 1.7484880757955508e-06, "loss": 0.41692309, "memory(GiB)": 33.45, "step": 40, "train_speed(iter/s)": 0.162212 }, { "acc": 0.89934006, "epoch": 0.06639616377720399, "grad_norm": 1.880024272875225, "learning_rate": 1.8043159324490168e-06, "loss": 0.37824535, "memory(GiB)": 32.49, "step": 45, "train_speed(iter/s)": 0.164895 }, { "acc": 0.89317064, "epoch": 0.07377351530800443, "grad_norm": 2.4862794709135483, "learning_rate": 1.8542556519706e-06, "loss": 0.39434323, "memory(GiB)": 31.37, "step": 50, "train_speed(iter/s)": 0.166039 }, { "epoch": 0.07377351530800443, "eval_acc": 0.8897788969852836, "eval_loss": 0.3586576581001282, "eval_runtime": 9.1458, "eval_samples_per_second": 23.836, "eval_steps_per_second": 3.062, "step": 50 }, { "acc": 0.90738754, "epoch": 0.08115086683880487, "grad_norm": 1.818011862869067, "learning_rate": 1.8994316234174147e-06, "loss": 0.34018734, "memory(GiB)": 43.99, "step": 55, "train_speed(iter/s)": 0.163069 }, { "acc": 0.89877386, "epoch": 0.08852821836960531, "grad_norm": 2.769061395622785, "learning_rate": 1.940674054084334e-06, "loss": 0.3834722, "memory(GiB)": 33.18, "step": 60, "train_speed(iter/s)": 0.163587 }, { "acc": 0.89560518, "epoch": 0.09590556990040576, "grad_norm": 3.0254291124967776, "learning_rate": 1.9786134125433064e-06, "loss": 0.40774279, "memory(GiB)": 36.96, "step": 65, "train_speed(iter/s)": 0.163438 }, { "acc": 0.90745316, "epoch": 0.1032829214312062, "grad_norm": 1.9702664127406297, "learning_rate": 1.998444790046656e-06, "loss": 0.34646974, "memory(GiB)": 33.91, "step": 70, "train_speed(iter/s)": 0.165839 }, { "acc": 0.90453644, "epoch": 0.11066027296200664, "grad_norm": 1.956498769069037, "learning_rate": 1.990668740279938e-06, "loss": 0.34771657, "memory(GiB)": 32.4, "step": 75, "train_speed(iter/s)": 0.166283 }, { "acc": 0.90620461, "epoch": 0.11803762449280708, "grad_norm": 1.7929520466502804, "learning_rate": 1.9828926905132194e-06, "loss": 0.34979777, "memory(GiB)": 32.69, "step": 80, "train_speed(iter/s)": 0.166045 }, { "acc": 0.90826426, "epoch": 0.12541497602360752, "grad_norm": 2.255532399806791, "learning_rate": 1.975116640746501e-06, "loss": 0.34021211, "memory(GiB)": 32.39, "step": 85, "train_speed(iter/s)": 0.16736 }, { "acc": 0.90400352, "epoch": 0.13279232755440798, "grad_norm": 1.606426887028717, "learning_rate": 1.9673405909797823e-06, "loss": 0.3593976, "memory(GiB)": 33.28, "step": 90, "train_speed(iter/s)": 0.166086 }, { "acc": 0.90273075, "epoch": 0.14016967908520842, "grad_norm": 1.7550090784719037, "learning_rate": 1.959564541213064e-06, "loss": 0.34527693, "memory(GiB)": 32.74, "step": 95, "train_speed(iter/s)": 0.167937 }, { "acc": 0.90631161, "epoch": 0.14754703061600885, "grad_norm": 2.151177976553762, "learning_rate": 1.9517884914463452e-06, "loss": 0.34601164, "memory(GiB)": 34.44, "step": 100, "train_speed(iter/s)": 0.167745 }, { "epoch": 0.14754703061600885, "eval_acc": 0.8985658665523646, "eval_loss": 0.3217943012714386, "eval_runtime": 9.0118, "eval_samples_per_second": 24.19, "eval_steps_per_second": 3.107, "step": 100 }, { "acc": 0.90445766, "epoch": 0.1549243821468093, "grad_norm": 2.0562867995030527, "learning_rate": 1.9440124416796267e-06, "loss": 0.34789481, "memory(GiB)": 42.9, "step": 105, "train_speed(iter/s)": 0.164588 }, { "acc": 0.90358963, "epoch": 0.16230173367760975, "grad_norm": 1.8705476431194374, "learning_rate": 1.936236391912908e-06, "loss": 0.34220786, "memory(GiB)": 31.78, "step": 110, "train_speed(iter/s)": 0.165873 }, { "acc": 0.9085845, "epoch": 0.16967908520841019, "grad_norm": 1.8278699994168497, "learning_rate": 1.9284603421461896e-06, "loss": 0.3233917, "memory(GiB)": 31.86, "step": 115, "train_speed(iter/s)": 0.16598 }, { "acc": 0.90997429, "epoch": 0.17705643673921062, "grad_norm": 1.945716912044592, "learning_rate": 1.920684292379471e-06, "loss": 0.34307232, "memory(GiB)": 35.12, "step": 120, "train_speed(iter/s)": 0.166556 }, { "acc": 0.91014824, "epoch": 0.18443378827001106, "grad_norm": 1.7135397704667659, "learning_rate": 1.912908242612753e-06, "loss": 0.32152495, "memory(GiB)": 35.65, "step": 125, "train_speed(iter/s)": 0.167431 }, { "acc": 0.9074892, "epoch": 0.19181113980081152, "grad_norm": 1.7116721779311537, "learning_rate": 1.9051321928460342e-06, "loss": 0.32937753, "memory(GiB)": 33.19, "step": 130, "train_speed(iter/s)": 0.167152 }, { "acc": 0.90999937, "epoch": 0.19918849133161196, "grad_norm": 1.6389355962957932, "learning_rate": 1.8973561430793156e-06, "loss": 0.33004179, "memory(GiB)": 33.36, "step": 135, "train_speed(iter/s)": 0.168049 }, { "acc": 0.9056819, "epoch": 0.2065658428624124, "grad_norm": 1.618401896535921, "learning_rate": 1.889580093312597e-06, "loss": 0.32887373, "memory(GiB)": 31.72, "step": 140, "train_speed(iter/s)": 0.167987 }, { "acc": 0.90799198, "epoch": 0.21394319439321283, "grad_norm": 2.0697336354422076, "learning_rate": 1.8818040435458787e-06, "loss": 0.33212447, "memory(GiB)": 32.61, "step": 145, "train_speed(iter/s)": 0.168358 }, { "acc": 0.89975605, "epoch": 0.2213205459240133, "grad_norm": 1.645561918074026, "learning_rate": 1.8740279937791602e-06, "loss": 0.35846872, "memory(GiB)": 32.3, "step": 150, "train_speed(iter/s)": 0.169041 }, { "epoch": 0.2213205459240133, "eval_acc": 0.9009412058865552, "eval_loss": 0.31137242913246155, "eval_runtime": 8.9003, "eval_samples_per_second": 24.494, "eval_steps_per_second": 3.146, "step": 150 }, { "acc": 0.90751858, "epoch": 0.22869789745481373, "grad_norm": 1.717914687308357, "learning_rate": 1.8662519440124416e-06, "loss": 0.33635845, "memory(GiB)": 43.6, "step": 155, "train_speed(iter/s)": 0.167082 }, { "acc": 0.90450516, "epoch": 0.23607524898561416, "grad_norm": 1.6863266349964434, "learning_rate": 1.858475894245723e-06, "loss": 0.35405197, "memory(GiB)": 33.81, "step": 160, "train_speed(iter/s)": 0.167855 }, { "acc": 0.90395164, "epoch": 0.2434526005164146, "grad_norm": 2.1013428529714906, "learning_rate": 1.8506998444790045e-06, "loss": 0.34658258, "memory(GiB)": 32.9, "step": 165, "train_speed(iter/s)": 0.167867 }, { "acc": 0.91127558, "epoch": 0.25082995204721503, "grad_norm": 1.6631238092162342, "learning_rate": 1.842923794712286e-06, "loss": 0.32777104, "memory(GiB)": 33.53, "step": 170, "train_speed(iter/s)": 0.168028 }, { "acc": 0.90831413, "epoch": 0.25820730357801547, "grad_norm": 2.0857884493375756, "learning_rate": 1.8351477449455676e-06, "loss": 0.32164063, "memory(GiB)": 32.03, "step": 175, "train_speed(iter/s)": 0.169138 }, { "acc": 0.91539364, "epoch": 0.26558465510881596, "grad_norm": 2.0145344122511095, "learning_rate": 1.827371695178849e-06, "loss": 0.30975475, "memory(GiB)": 34.31, "step": 180, "train_speed(iter/s)": 0.168973 }, { "acc": 0.9064558, "epoch": 0.2729620066396164, "grad_norm": 1.6651879684580124, "learning_rate": 1.8195956454121305e-06, "loss": 0.3413609, "memory(GiB)": 32.63, "step": 185, "train_speed(iter/s)": 0.169312 }, { "acc": 0.90828686, "epoch": 0.28033935817041683, "grad_norm": 2.3469960245148056, "learning_rate": 1.811819595645412e-06, "loss": 0.32660947, "memory(GiB)": 33.41, "step": 190, "train_speed(iter/s)": 0.169856 }, { "acc": 0.91549397, "epoch": 0.28771670970121727, "grad_norm": 2.1806025367886117, "learning_rate": 1.8040435458786937e-06, "loss": 0.30616875, "memory(GiB)": 36.24, "step": 195, "train_speed(iter/s)": 0.169761 }, { "acc": 0.90924969, "epoch": 0.2950940612320177, "grad_norm": 1.5587292681869693, "learning_rate": 1.7962674961119751e-06, "loss": 0.32027857, "memory(GiB)": 32.62, "step": 200, "train_speed(iter/s)": 0.170581 }, { "epoch": 0.2950940612320177, "eval_acc": 0.901896699528504, "eval_loss": 0.3015853464603424, "eval_runtime": 9.0231, "eval_samples_per_second": 24.16, "eval_steps_per_second": 3.103, "step": 200 }, { "acc": 0.91348085, "epoch": 0.30247141276281814, "grad_norm": 1.7818986098446097, "learning_rate": 1.7884914463452566e-06, "loss": 0.30208986, "memory(GiB)": 44.06, "step": 205, "train_speed(iter/s)": 0.169194 }, { "acc": 0.90921364, "epoch": 0.3098487642936186, "grad_norm": 4.02077354284952, "learning_rate": 1.780715396578538e-06, "loss": 0.31497798, "memory(GiB)": 34.58, "step": 210, "train_speed(iter/s)": 0.169003 }, { "acc": 0.91234264, "epoch": 0.317226115824419, "grad_norm": 1.856976113207096, "learning_rate": 1.7729393468118195e-06, "loss": 0.30694566, "memory(GiB)": 33.8, "step": 215, "train_speed(iter/s)": 0.16984 }, { "acc": 0.91051998, "epoch": 0.3246034673552195, "grad_norm": 1.7185168230569432, "learning_rate": 1.765163297045101e-06, "loss": 0.30961909, "memory(GiB)": 32.79, "step": 220, "train_speed(iter/s)": 0.169666 }, { "acc": 0.90716095, "epoch": 0.33198081888601993, "grad_norm": 1.340608010048739, "learning_rate": 1.7573872472783826e-06, "loss": 0.32777991, "memory(GiB)": 32.43, "step": 225, "train_speed(iter/s)": 0.169965 }, { "acc": 0.91547451, "epoch": 0.33935817041682037, "grad_norm": 1.6059763623857688, "learning_rate": 1.749611197511664e-06, "loss": 0.30423913, "memory(GiB)": 34.95, "step": 230, "train_speed(iter/s)": 0.169935 }, { "acc": 0.917132, "epoch": 0.3467355219476208, "grad_norm": 2.0390121908637644, "learning_rate": 1.7418351477449455e-06, "loss": 0.30788417, "memory(GiB)": 34.18, "step": 235, "train_speed(iter/s)": 0.169583 }, { "acc": 0.92253389, "epoch": 0.35411287347842124, "grad_norm": 1.7323441045370742, "learning_rate": 1.734059097978227e-06, "loss": 0.27823753, "memory(GiB)": 31.85, "step": 240, "train_speed(iter/s)": 0.17024 }, { "acc": 0.91325512, "epoch": 0.3614902250092217, "grad_norm": 1.6955182367729624, "learning_rate": 1.7262830482115086e-06, "loss": 0.31402481, "memory(GiB)": 32.14, "step": 245, "train_speed(iter/s)": 0.169973 }, { "acc": 0.91568565, "epoch": 0.3688675765400221, "grad_norm": 1.5212817841417117, "learning_rate": 1.71850699844479e-06, "loss": 0.29354782, "memory(GiB)": 33.28, "step": 250, "train_speed(iter/s)": 0.169891 }, { "epoch": 0.3688675765400221, "eval_acc": 0.903888055436491, "eval_loss": 0.2949393689632416, "eval_runtime": 8.8569, "eval_samples_per_second": 24.614, "eval_steps_per_second": 3.161, "step": 250 }, { "acc": 0.91542091, "epoch": 0.37624492807082255, "grad_norm": 1.872512089057089, "learning_rate": 1.7107309486780715e-06, "loss": 0.29765024, "memory(GiB)": 43.8, "step": 255, "train_speed(iter/s)": 0.169287 }, { "acc": 0.90894642, "epoch": 0.38362227960162304, "grad_norm": 2.118992381164901, "learning_rate": 1.702954898911353e-06, "loss": 0.32009149, "memory(GiB)": 33.0, "step": 260, "train_speed(iter/s)": 0.169108 }, { "acc": 0.91895199, "epoch": 0.3909996311324235, "grad_norm": 1.8087446200238866, "learning_rate": 1.6951788491446344e-06, "loss": 0.28518291, "memory(GiB)": 33.64, "step": 265, "train_speed(iter/s)": 0.169659 }, { "acc": 0.91831837, "epoch": 0.3983769826632239, "grad_norm": 2.295227865477349, "learning_rate": 1.6874027993779158e-06, "loss": 0.29493954, "memory(GiB)": 32.16, "step": 270, "train_speed(iter/s)": 0.16921 }, { "acc": 0.91772842, "epoch": 0.40575433419402435, "grad_norm": 1.8335936104899577, "learning_rate": 1.6796267496111975e-06, "loss": 0.29295368, "memory(GiB)": 32.48, "step": 275, "train_speed(iter/s)": 0.169211 }, { "acc": 0.9184288, "epoch": 0.4131316857248248, "grad_norm": 1.9183997806679902, "learning_rate": 1.671850699844479e-06, "loss": 0.29449196, "memory(GiB)": 32.65, "step": 280, "train_speed(iter/s)": 0.169821 }, { "acc": 0.91275759, "epoch": 0.4205090372556252, "grad_norm": 1.5737005817463792, "learning_rate": 1.6640746500777604e-06, "loss": 0.30824404, "memory(GiB)": 32.27, "step": 285, "train_speed(iter/s)": 0.169618 }, { "acc": 0.91761837, "epoch": 0.42788638878642565, "grad_norm": 1.6411868652328097, "learning_rate": 1.6562986003110419e-06, "loss": 0.28589807, "memory(GiB)": 33.9, "step": 290, "train_speed(iter/s)": 0.16978 }, { "acc": 0.91096239, "epoch": 0.4352637403172261, "grad_norm": 1.4763719992796571, "learning_rate": 1.6485225505443235e-06, "loss": 0.31501875, "memory(GiB)": 33.9, "step": 295, "train_speed(iter/s)": 0.170116 }, { "acc": 0.92102461, "epoch": 0.4426410918480266, "grad_norm": 1.7038633862826587, "learning_rate": 1.640746500777605e-06, "loss": 0.28700156, "memory(GiB)": 33.12, "step": 300, "train_speed(iter/s)": 0.16999 }, { "epoch": 0.4426410918480266, "eval_acc": 0.904986426632376, "eval_loss": 0.28871360421180725, "eval_runtime": 8.8172, "eval_samples_per_second": 24.724, "eval_steps_per_second": 3.176, "step": 300 }, { "acc": 0.9137413, "epoch": 0.450018443378827, "grad_norm": 1.5572757830459178, "learning_rate": 1.6329704510108864e-06, "loss": 0.3066596, "memory(GiB)": 44.77, "step": 305, "train_speed(iter/s)": 0.169643 }, { "acc": 0.92225361, "epoch": 0.45739579490962745, "grad_norm": 1.7973596806557957, "learning_rate": 1.6251944012441679e-06, "loss": 0.28060098, "memory(GiB)": 34.38, "step": 310, "train_speed(iter/s)": 0.169469 }, { "acc": 0.91542816, "epoch": 0.4647731464404279, "grad_norm": 1.7774091029439925, "learning_rate": 1.6174183514774493e-06, "loss": 0.29976537, "memory(GiB)": 33.81, "step": 315, "train_speed(iter/s)": 0.169523 }, { "acc": 0.91291943, "epoch": 0.4721504979712283, "grad_norm": 1.3755306649838441, "learning_rate": 1.6096423017107308e-06, "loss": 0.30613976, "memory(GiB)": 33.81, "step": 320, "train_speed(iter/s)": 0.169769 }, { "acc": 0.90916691, "epoch": 0.47952784950202876, "grad_norm": 1.9213831375809023, "learning_rate": 1.6018662519440122e-06, "loss": 0.32510529, "memory(GiB)": 34.44, "step": 325, "train_speed(iter/s)": 0.169545 }, { "acc": 0.91636696, "epoch": 0.4869052010328292, "grad_norm": 1.8837685149781478, "learning_rate": 1.5940902021772939e-06, "loss": 0.30537646, "memory(GiB)": 31.2, "step": 330, "train_speed(iter/s)": 0.170038 }, { "acc": 0.91307325, "epoch": 0.4942825525636297, "grad_norm": 1.8595782698159422, "learning_rate": 1.5863141524105753e-06, "loss": 0.30300996, "memory(GiB)": 30.74, "step": 335, "train_speed(iter/s)": 0.169983 }, { "acc": 0.91927223, "epoch": 0.5016599040944301, "grad_norm": 1.8693944311229003, "learning_rate": 1.5785381026438568e-06, "loss": 0.28294766, "memory(GiB)": 31.5, "step": 340, "train_speed(iter/s)": 0.170169 }, { "acc": 0.92018118, "epoch": 0.5090372556252305, "grad_norm": 1.6240951695142463, "learning_rate": 1.5707620528771385e-06, "loss": 0.27536349, "memory(GiB)": 32.84, "step": 345, "train_speed(iter/s)": 0.170494 }, { "acc": 0.91428967, "epoch": 0.5164146071560309, "grad_norm": 2.0654305075288653, "learning_rate": 1.56298600311042e-06, "loss": 0.30193062, "memory(GiB)": 33.88, "step": 350, "train_speed(iter/s)": 0.170499 }, { "epoch": 0.5164146071560309, "eval_acc": 0.906031218745535, "eval_loss": 0.2829771637916565, "eval_runtime": 8.9252, "eval_samples_per_second": 24.425, "eval_steps_per_second": 3.137, "step": 350 }, { "acc": 0.92116051, "epoch": 0.5237919586868315, "grad_norm": 2.2709862324112136, "learning_rate": 1.5552099533437014e-06, "loss": 0.277144, "memory(GiB)": 44.05, "step": 355, "train_speed(iter/s)": 0.169773 }, { "acc": 0.90278854, "epoch": 0.5311693102176319, "grad_norm": 1.9738153042801483, "learning_rate": 1.5474339035769828e-06, "loss": 0.33822517, "memory(GiB)": 31.78, "step": 360, "train_speed(iter/s)": 0.170163 }, { "acc": 0.92497654, "epoch": 0.5385466617484324, "grad_norm": 1.2430005126419985, "learning_rate": 1.5396578538102643e-06, "loss": 0.26646669, "memory(GiB)": 33.8, "step": 365, "train_speed(iter/s)": 0.16992 }, { "acc": 0.91328669, "epoch": 0.5459240132792328, "grad_norm": 1.732568460701246, "learning_rate": 1.5318818040435457e-06, "loss": 0.30124869, "memory(GiB)": 34.07, "step": 370, "train_speed(iter/s)": 0.170382 }, { "acc": 0.91603355, "epoch": 0.5533013648100332, "grad_norm": 1.6627563648419381, "learning_rate": 1.5241057542768272e-06, "loss": 0.29759171, "memory(GiB)": 32.61, "step": 375, "train_speed(iter/s)": 0.170197 }, { "acc": 0.90871716, "epoch": 0.5606787163408337, "grad_norm": 2.1331488669107492, "learning_rate": 1.5163297045101088e-06, "loss": 0.33630853, "memory(GiB)": 32.33, "step": 380, "train_speed(iter/s)": 0.17029 }, { "acc": 0.90700073, "epoch": 0.5680560678716341, "grad_norm": 2.080763753555995, "learning_rate": 1.5085536547433903e-06, "loss": 0.325877, "memory(GiB)": 32.95, "step": 385, "train_speed(iter/s)": 0.170474 }, { "acc": 0.91835623, "epoch": 0.5754334194024345, "grad_norm": 1.5911495384236254, "learning_rate": 1.500777604976672e-06, "loss": 0.28332872, "memory(GiB)": 31.78, "step": 390, "train_speed(iter/s)": 0.170283 }, { "acc": 0.91712914, "epoch": 0.582810770933235, "grad_norm": 1.6237776507352246, "learning_rate": 1.4930015552099534e-06, "loss": 0.28782868, "memory(GiB)": 33.13, "step": 395, "train_speed(iter/s)": 0.170424 }, { "acc": 0.92452984, "epoch": 0.5901881224640354, "grad_norm": 1.9617693211652296, "learning_rate": 1.4852255054432348e-06, "loss": 0.25721183, "memory(GiB)": 34.52, "step": 400, "train_speed(iter/s)": 0.170549 }, { "epoch": 0.5901881224640354, "eval_acc": 0.9067634662094585, "eval_loss": 0.27780693769454956, "eval_runtime": 8.9713, "eval_samples_per_second": 24.3, "eval_steps_per_second": 3.121, "step": 400 }, { "acc": 0.91402645, "epoch": 0.5975654739948358, "grad_norm": 1.6283342820719429, "learning_rate": 1.4774494556765163e-06, "loss": 0.29935551, "memory(GiB)": 43.79, "step": 405, "train_speed(iter/s)": 0.169655 }, { "acc": 0.91232147, "epoch": 0.6049428255256363, "grad_norm": 1.7979698219270268, "learning_rate": 1.4696734059097977e-06, "loss": 0.29618566, "memory(GiB)": 34.75, "step": 410, "train_speed(iter/s)": 0.169867 }, { "acc": 0.91495514, "epoch": 0.6123201770564367, "grad_norm": 1.400313093548897, "learning_rate": 1.4618973561430792e-06, "loss": 0.30076814, "memory(GiB)": 33.36, "step": 415, "train_speed(iter/s)": 0.169686 }, { "acc": 0.91793385, "epoch": 0.6196975285872371, "grad_norm": 1.5440217170439645, "learning_rate": 1.4541213063763606e-06, "loss": 0.27723732, "memory(GiB)": 32.03, "step": 420, "train_speed(iter/s)": 0.169706 }, { "acc": 0.92025652, "epoch": 0.6270748801180376, "grad_norm": 1.7171089334482643, "learning_rate": 1.446345256609642e-06, "loss": 0.28218346, "memory(GiB)": 31.84, "step": 425, "train_speed(iter/s)": 0.169824 }, { "acc": 0.91456184, "epoch": 0.634452231648838, "grad_norm": 1.7617810648771757, "learning_rate": 1.4385692068429238e-06, "loss": 0.30232787, "memory(GiB)": 33.01, "step": 430, "train_speed(iter/s)": 0.169549 }, { "acc": 0.91554451, "epoch": 0.6418295831796386, "grad_norm": 2.1102714988825966, "learning_rate": 1.4307931570762052e-06, "loss": 0.29879627, "memory(GiB)": 33.18, "step": 435, "train_speed(iter/s)": 0.169677 }, { "acc": 0.92126179, "epoch": 0.649206934710439, "grad_norm": 2.046949703950944, "learning_rate": 1.4230171073094869e-06, "loss": 0.27905126, "memory(GiB)": 35.07, "step": 440, "train_speed(iter/s)": 0.169605 }, { "acc": 0.90152893, "epoch": 0.6565842862412394, "grad_norm": 2.001971595085909, "learning_rate": 1.4152410575427683e-06, "loss": 0.34060516, "memory(GiB)": 33.51, "step": 445, "train_speed(iter/s)": 0.169689 }, { "acc": 0.91629639, "epoch": 0.6639616377720399, "grad_norm": 2.0397672790155528, "learning_rate": 1.4074650077760498e-06, "loss": 0.28595252, "memory(GiB)": 34.12, "step": 450, "train_speed(iter/s)": 0.170047 }, { "epoch": 0.6639616377720399, "eval_acc": 0.9078082583226175, "eval_loss": 0.2715848386287689, "eval_runtime": 8.8964, "eval_samples_per_second": 24.504, "eval_steps_per_second": 3.147, "step": 450 }, { "acc": 0.92627125, "epoch": 0.6713389893028403, "grad_norm": 1.6378143906534044, "learning_rate": 1.3996889580093312e-06, "loss": 0.25918436, "memory(GiB)": 43.88, "step": 455, "train_speed(iter/s)": 0.169369 }, { "acc": 0.91979427, "epoch": 0.6787163408336407, "grad_norm": 1.7082862687854972, "learning_rate": 1.3919129082426127e-06, "loss": 0.27077117, "memory(GiB)": 32.33, "step": 460, "train_speed(iter/s)": 0.169438 }, { "acc": 0.91361713, "epoch": 0.6860936923644412, "grad_norm": 2.293000555161464, "learning_rate": 1.3841368584758941e-06, "loss": 0.30449131, "memory(GiB)": 32.93, "step": 465, "train_speed(iter/s)": 0.169581 }, { "acc": 0.91954422, "epoch": 0.6934710438952416, "grad_norm": 1.8478883729217541, "learning_rate": 1.3763608087091756e-06, "loss": 0.29147563, "memory(GiB)": 32.32, "step": 470, "train_speed(iter/s)": 0.169425 }, { "acc": 0.91925821, "epoch": 0.700848395426042, "grad_norm": 2.1771276083255833, "learning_rate": 1.368584758942457e-06, "loss": 0.27578421, "memory(GiB)": 31.55, "step": 475, "train_speed(iter/s)": 0.169717 }, { "acc": 0.91978226, "epoch": 0.7082257469568425, "grad_norm": 1.5525703471804124, "learning_rate": 1.3608087091757387e-06, "loss": 0.28457327, "memory(GiB)": 34.35, "step": 480, "train_speed(iter/s)": 0.169473 }, { "acc": 0.91358566, "epoch": 0.7156030984876429, "grad_norm": 1.6094545899681876, "learning_rate": 1.3530326594090201e-06, "loss": 0.29641771, "memory(GiB)": 34.35, "step": 485, "train_speed(iter/s)": 0.169292 }, { "acc": 0.9157114, "epoch": 0.7229804500184434, "grad_norm": 2.001462148706446, "learning_rate": 1.3452566096423018e-06, "loss": 0.30091541, "memory(GiB)": 33.0, "step": 490, "train_speed(iter/s)": 0.169539 }, { "acc": 0.9181448, "epoch": 0.7303578015492438, "grad_norm": 1.933852376850104, "learning_rate": 1.3374805598755833e-06, "loss": 0.28622799, "memory(GiB)": 31.96, "step": 495, "train_speed(iter/s)": 0.169315 }, { "acc": 0.91473122, "epoch": 0.7377351530800442, "grad_norm": 1.9036456322193762, "learning_rate": 1.3297045101088647e-06, "loss": 0.3094301, "memory(GiB)": 31.84, "step": 500, "train_speed(iter/s)": 0.169482 }, { "epoch": 0.7377351530800442, "eval_acc": 0.9090048578368338, "eval_loss": 0.2688305675983429, "eval_runtime": 8.8274, "eval_samples_per_second": 24.696, "eval_steps_per_second": 3.172, "step": 500 }, { "acc": 0.91458435, "epoch": 0.7451125046108447, "grad_norm": 1.9335752594206985, "learning_rate": 1.3219284603421462e-06, "loss": 0.29494238, "memory(GiB)": 43.4, "step": 505, "train_speed(iter/s)": 0.168821 }, { "acc": 0.9221386, "epoch": 0.7524898561416451, "grad_norm": 1.8197097143608403, "learning_rate": 1.3141524105754276e-06, "loss": 0.2647439, "memory(GiB)": 33.36, "step": 510, "train_speed(iter/s)": 0.168682 }, { "acc": 0.92193203, "epoch": 0.7598672076724456, "grad_norm": 1.901554742963865, "learning_rate": 1.306376360808709e-06, "loss": 0.27191839, "memory(GiB)": 30.47, "step": 515, "train_speed(iter/s)": 0.168924 }, { "acc": 0.91413088, "epoch": 0.7672445592032461, "grad_norm": 2.0670792917636236, "learning_rate": 1.2986003110419905e-06, "loss": 0.296503, "memory(GiB)": 32.43, "step": 520, "train_speed(iter/s)": 0.168732 }, { "acc": 0.92014456, "epoch": 0.7746219107340465, "grad_norm": 1.3940992355499904, "learning_rate": 1.290824261275272e-06, "loss": 0.27345006, "memory(GiB)": 31.88, "step": 525, "train_speed(iter/s)": 0.168564 }, { "acc": 0.91787033, "epoch": 0.781999262264847, "grad_norm": 1.7528498159038246, "learning_rate": 1.2830482115085536e-06, "loss": 0.27718287, "memory(GiB)": 32.83, "step": 530, "train_speed(iter/s)": 0.168633 }, { "acc": 0.91950254, "epoch": 0.7893766137956474, "grad_norm": 1.6045395248629215, "learning_rate": 1.275272161741835e-06, "loss": 0.27553134, "memory(GiB)": 30.99, "step": 535, "train_speed(iter/s)": 0.168504 }, { "acc": 0.91442375, "epoch": 0.7967539653264478, "grad_norm": 2.0480557410695686, "learning_rate": 1.2674961119751167e-06, "loss": 0.29672928, "memory(GiB)": 32.9, "step": 540, "train_speed(iter/s)": 0.168746 }, { "acc": 0.91783228, "epoch": 0.8041313168572483, "grad_norm": 1.7063380836356228, "learning_rate": 1.2597200622083982e-06, "loss": 0.28551073, "memory(GiB)": 32.64, "step": 545, "train_speed(iter/s)": 0.168632 }, { "acc": 0.91965294, "epoch": 0.8115086683880487, "grad_norm": 1.8091430299196016, "learning_rate": 1.2519440124416796e-06, "loss": 0.28367462, "memory(GiB)": 33.12, "step": 550, "train_speed(iter/s)": 0.168537 }, { "epoch": 0.8115086683880487, "eval_acc": 0.9094959994284898, "eval_loss": 0.265609472990036, "eval_runtime": 8.9354, "eval_samples_per_second": 24.397, "eval_steps_per_second": 3.134, "step": 550 }, { "acc": 0.91708422, "epoch": 0.8188860199188491, "grad_norm": 1.9338041082162762, "learning_rate": 1.244167962674961e-06, "loss": 0.30288501, "memory(GiB)": 44.46, "step": 555, "train_speed(iter/s)": 0.168246 }, { "acc": 0.91793032, "epoch": 0.8262633714496496, "grad_norm": 1.960186880981984, "learning_rate": 1.2363919129082425e-06, "loss": 0.29391913, "memory(GiB)": 33.02, "step": 560, "train_speed(iter/s)": 0.168119 }, { "acc": 0.92976294, "epoch": 0.83364072298045, "grad_norm": 1.7220525036525174, "learning_rate": 1.228615863141524e-06, "loss": 0.24753182, "memory(GiB)": 32.77, "step": 565, "train_speed(iter/s)": 0.16819 }, { "acc": 0.9202878, "epoch": 0.8410180745112504, "grad_norm": 1.9681280144249207, "learning_rate": 1.2208398133748054e-06, "loss": 0.27648234, "memory(GiB)": 32.36, "step": 570, "train_speed(iter/s)": 0.168331 }, { "acc": 0.91870079, "epoch": 0.8483954260420509, "grad_norm": 1.6402903494642216, "learning_rate": 1.2130637636080869e-06, "loss": 0.29140263, "memory(GiB)": 35.18, "step": 575, "train_speed(iter/s)": 0.168255 }, { "acc": 0.91364193, "epoch": 0.8557727775728513, "grad_norm": 2.146651599757078, "learning_rate": 1.2052877138413686e-06, "loss": 0.31224487, "memory(GiB)": 37.43, "step": 580, "train_speed(iter/s)": 0.168463 }, { "acc": 0.92091951, "epoch": 0.8631501291036517, "grad_norm": 2.110687395796676, "learning_rate": 1.19751166407465e-06, "loss": 0.27074888, "memory(GiB)": 30.34, "step": 585, "train_speed(iter/s)": 0.16837 }, { "acc": 0.92361298, "epoch": 0.8705274806344522, "grad_norm": 1.341809177582426, "learning_rate": 1.1897356143079317e-06, "loss": 0.26371779, "memory(GiB)": 32.35, "step": 590, "train_speed(iter/s)": 0.168375 }, { "acc": 0.92123985, "epoch": 0.8779048321652527, "grad_norm": 1.8270563745834436, "learning_rate": 1.1819595645412131e-06, "loss": 0.26702247, "memory(GiB)": 34.77, "step": 595, "train_speed(iter/s)": 0.168532 }, { "acc": 0.91653709, "epoch": 0.8852821836960532, "grad_norm": 1.6527432011832037, "learning_rate": 1.1741835147744946e-06, "loss": 0.29842911, "memory(GiB)": 33.87, "step": 600, "train_speed(iter/s)": 0.168424 }, { "epoch": 0.8852821836960532, "eval_acc": 0.9105765109301329, "eval_loss": 0.2623133361339569, "eval_runtime": 8.7796, "eval_samples_per_second": 24.83, "eval_steps_per_second": 3.189, "step": 600 }, { "acc": 0.91810665, "epoch": 0.8926595352268536, "grad_norm": 1.3239706750197222, "learning_rate": 1.166407465007776e-06, "loss": 0.29543982, "memory(GiB)": 43.63, "step": 605, "train_speed(iter/s)": 0.16811 }, { "acc": 0.92373562, "epoch": 0.900036886757654, "grad_norm": 1.589090709862595, "learning_rate": 1.1586314152410575e-06, "loss": 0.27000737, "memory(GiB)": 32.08, "step": 610, "train_speed(iter/s)": 0.168111 }, { "acc": 0.92571859, "epoch": 0.9074142382884545, "grad_norm": 1.786690071917202, "learning_rate": 1.150855365474339e-06, "loss": 0.26558821, "memory(GiB)": 34.26, "step": 615, "train_speed(iter/s)": 0.167944 }, { "acc": 0.92350941, "epoch": 0.9147915898192549, "grad_norm": 1.4482760998007842, "learning_rate": 1.1430793157076204e-06, "loss": 0.27038224, "memory(GiB)": 32.87, "step": 620, "train_speed(iter/s)": 0.168075 }, { "acc": 0.92567997, "epoch": 0.9221689413500553, "grad_norm": 1.5651995631831526, "learning_rate": 1.1353032659409018e-06, "loss": 0.25891747, "memory(GiB)": 32.63, "step": 625, "train_speed(iter/s)": 0.168015 }, { "acc": 0.91823616, "epoch": 0.9295462928808558, "grad_norm": 1.4462434724962336, "learning_rate": 1.1275272161741835e-06, "loss": 0.2788033, "memory(GiB)": 38.22, "step": 630, "train_speed(iter/s)": 0.167998 }, { "acc": 0.92322083, "epoch": 0.9369236444116562, "grad_norm": 1.4194043988299254, "learning_rate": 1.119751166407465e-06, "loss": 0.26030297, "memory(GiB)": 32.29, "step": 635, "train_speed(iter/s)": 0.168162 }, { "acc": 0.92457771, "epoch": 0.9443009959424566, "grad_norm": 1.8304569462755849, "learning_rate": 1.1119751166407466e-06, "loss": 0.27183619, "memory(GiB)": 35.33, "step": 640, "train_speed(iter/s)": 0.168086 }, { "acc": 0.9201807, "epoch": 0.9516783474732571, "grad_norm": 1.6355541683467607, "learning_rate": 1.104199066874028e-06, "loss": 0.27730408, "memory(GiB)": 31.4, "step": 645, "train_speed(iter/s)": 0.168284 }, { "acc": 0.92337418, "epoch": 0.9590556990040575, "grad_norm": 1.6309155055635356, "learning_rate": 1.0964230171073095e-06, "loss": 0.25860276, "memory(GiB)": 32.67, "step": 650, "train_speed(iter/s)": 0.168267 }, { "epoch": 0.9590556990040575, "eval_acc": 0.9113176882411773, "eval_loss": 0.2569684386253357, "eval_runtime": 8.8598, "eval_samples_per_second": 24.605, "eval_steps_per_second": 3.16, "step": 650 }, { "acc": 0.91919975, "epoch": 0.966433050534858, "grad_norm": 1.482378816274918, "learning_rate": 1.088646967340591e-06, "loss": 0.28527048, "memory(GiB)": 45.59, "step": 655, "train_speed(iter/s)": 0.167772 }, { "acc": 0.92037735, "epoch": 0.9738104020656584, "grad_norm": 2.2165369625767712, "learning_rate": 1.0808709175738724e-06, "loss": 0.28198528, "memory(GiB)": 32.93, "step": 660, "train_speed(iter/s)": 0.16789 }, { "acc": 0.92200727, "epoch": 0.9811877535964588, "grad_norm": 1.7151646172394919, "learning_rate": 1.0730948678071539e-06, "loss": 0.27098572, "memory(GiB)": 33.1, "step": 665, "train_speed(iter/s)": 0.167862 }, { "acc": 0.92197828, "epoch": 0.9885651051272594, "grad_norm": 2.076606131505725, "learning_rate": 1.0653188180404353e-06, "loss": 0.26747627, "memory(GiB)": 34.45, "step": 670, "train_speed(iter/s)": 0.167945 }, { "acc": 0.92063084, "epoch": 0.9959424566580598, "grad_norm": 1.7465662806523121, "learning_rate": 1.0575427682737168e-06, "loss": 0.27087922, "memory(GiB)": 39.51, "step": 675, "train_speed(iter/s)": 0.167951 } ], "logging_steps": 5, "max_steps": 1354, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 66000591650816.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }