{ "best_metric": 0.75, "best_model_checkpoint": "2024_08_13/checkpoint-62", "epoch": 29.76, "eval_steps": 500, "global_step": 930, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.32, "grad_norm": 10.543134689331055, "learning_rate": 1.0752688172043011e-07, "loss": 0.7672, "step": 10 }, { "epoch": 0.64, "grad_norm": 3.446369171142578, "learning_rate": 2.1505376344086022e-07, "loss": 0.7681, "step": 20 }, { "epoch": 0.96, "grad_norm": 11.52937126159668, "learning_rate": 3.225806451612903e-07, "loss": 0.7191, "step": 30 }, { "epoch": 0.992, "eval_accuracy": 0.25, "eval_loss": 0.7458651065826416, "eval_runtime": 1.1044, "eval_samples_per_second": 28.975, "eval_steps_per_second": 28.975, "step": 31 }, { "epoch": 1.28, "grad_norm": 5.57147741317749, "learning_rate": 4.3010752688172043e-07, "loss": 0.7269, "step": 40 }, { "epoch": 1.6, "grad_norm": 5.154497146606445, "learning_rate": 5.376344086021505e-07, "loss": 0.7094, "step": 50 }, { "epoch": 1.92, "grad_norm": 7.2774434089660645, "learning_rate": 6.451612903225806e-07, "loss": 0.6894, "step": 60 }, { "epoch": 1.984, "eval_accuracy": 0.75, "eval_loss": 0.6786516308784485, "eval_runtime": 1.183, "eval_samples_per_second": 27.05, "eval_steps_per_second": 27.05, "step": 62 }, { "epoch": 2.24, "grad_norm": 3.891305923461914, "learning_rate": 7.526881720430107e-07, "loss": 0.6601, "step": 70 }, { "epoch": 2.56, "grad_norm": 4.829281330108643, "learning_rate": 8.602150537634409e-07, "loss": 0.6542, "step": 80 }, { "epoch": 2.88, "grad_norm": 4.894190311431885, "learning_rate": 9.67741935483871e-07, "loss": 0.5993, "step": 90 }, { "epoch": 2.976, "eval_accuracy": 0.75, "eval_loss": 0.6089950799942017, "eval_runtime": 1.1112, "eval_samples_per_second": 28.798, "eval_steps_per_second": 28.798, "step": 93 }, { "epoch": 3.2, "grad_norm": 3.572125196456909, "learning_rate": 9.91636798088411e-07, "loss": 0.5845, "step": 100 }, { "epoch": 3.52, "grad_norm": 6.358020305633545, "learning_rate": 9.79689366786141e-07, "loss": 0.5637, "step": 110 }, { "epoch": 3.84, "grad_norm": 2.5889291763305664, "learning_rate": 9.67741935483871e-07, "loss": 0.5858, "step": 120 }, { "epoch": 4.0, "eval_accuracy": 0.75, "eval_loss": 0.5701560974121094, "eval_runtime": 1.1543, "eval_samples_per_second": 27.722, "eval_steps_per_second": 27.722, "step": 125 }, { "epoch": 4.16, "grad_norm": 5.341928482055664, "learning_rate": 9.557945041816009e-07, "loss": 0.559, "step": 130 }, { "epoch": 4.48, "grad_norm": 5.699573993682861, "learning_rate": 9.438470728793309e-07, "loss": 0.4752, "step": 140 }, { "epoch": 4.8, "grad_norm": 6.19121789932251, "learning_rate": 9.318996415770609e-07, "loss": 0.5407, "step": 150 }, { "epoch": 4.992, "eval_accuracy": 0.75, "eval_loss": 0.5572408437728882, "eval_runtime": 1.1052, "eval_samples_per_second": 28.954, "eval_steps_per_second": 28.954, "step": 156 }, { "epoch": 5.12, "grad_norm": 2.574436664581299, "learning_rate": 9.199522102747909e-07, "loss": 0.6231, "step": 160 }, { "epoch": 5.44, "grad_norm": 5.261692523956299, "learning_rate": 9.080047789725208e-07, "loss": 0.4838, "step": 170 }, { "epoch": 5.76, "grad_norm": 5.266414642333984, "learning_rate": 8.960573476702509e-07, "loss": 0.6552, "step": 180 }, { "epoch": 5.984, "eval_accuracy": 0.75, "eval_loss": 0.5552529692649841, "eval_runtime": 1.1351, "eval_samples_per_second": 28.192, "eval_steps_per_second": 28.192, "step": 187 }, { "epoch": 6.08, "grad_norm": 5.3043341636657715, "learning_rate": 8.841099163679809e-07, "loss": 0.5321, "step": 190 }, { "epoch": 6.4, "grad_norm": 3.8107211589813232, "learning_rate": 8.721624850657109e-07, "loss": 0.5797, "step": 200 }, { "epoch": 6.72, "grad_norm": 13.660761833190918, "learning_rate": 8.602150537634409e-07, "loss": 0.5562, "step": 210 }, { "epoch": 6.976, "eval_accuracy": 0.75, "eval_loss": 0.552901029586792, "eval_runtime": 1.1121, "eval_samples_per_second": 28.774, "eval_steps_per_second": 28.774, "step": 218 }, { "epoch": 7.04, "grad_norm": 5.3051605224609375, "learning_rate": 8.482676224611708e-07, "loss": 0.4844, "step": 220 }, { "epoch": 7.36, "grad_norm": 3.601945400238037, "learning_rate": 8.363201911589009e-07, "loss": 0.6067, "step": 230 }, { "epoch": 7.68, "grad_norm": 5.2441229820251465, "learning_rate": 8.243727598566307e-07, "loss": 0.519, "step": 240 }, { "epoch": 8.0, "grad_norm": 8.390249252319336, "learning_rate": 8.124253285543607e-07, "loss": 0.6054, "step": 250 }, { "epoch": 8.0, "eval_accuracy": 0.75, "eval_loss": 0.5519319772720337, "eval_runtime": 1.1259, "eval_samples_per_second": 28.423, "eval_steps_per_second": 28.423, "step": 250 }, { "epoch": 8.32, "grad_norm": 4.872560024261475, "learning_rate": 8.004778972520908e-07, "loss": 0.4244, "step": 260 }, { "epoch": 8.64, "grad_norm": 4.932515621185303, "learning_rate": 7.885304659498207e-07, "loss": 0.501, "step": 270 }, { "epoch": 8.96, "grad_norm": 10.174718856811523, "learning_rate": 7.765830346475507e-07, "loss": 0.7563, "step": 280 }, { "epoch": 8.992, "eval_accuracy": 0.75, "eval_loss": 0.5517733097076416, "eval_runtime": 1.1268, "eval_samples_per_second": 28.399, "eval_steps_per_second": 28.399, "step": 281 }, { "epoch": 9.28, "grad_norm": 10.531723022460938, "learning_rate": 7.646356033452807e-07, "loss": 0.7409, "step": 290 }, { "epoch": 9.6, "grad_norm": 5.187292098999023, "learning_rate": 7.526881720430107e-07, "loss": 0.4602, "step": 300 }, { "epoch": 9.92, "grad_norm": 5.227697849273682, "learning_rate": 7.407407407407406e-07, "loss": 0.5174, "step": 310 }, { "epoch": 9.984, "eval_accuracy": 0.75, "eval_loss": 0.5523006319999695, "eval_runtime": 1.1419, "eval_samples_per_second": 28.023, "eval_steps_per_second": 28.023, "step": 312 }, { "epoch": 10.24, "grad_norm": 3.8940961360931396, "learning_rate": 7.287933094384707e-07, "loss": 0.4655, "step": 320 }, { "epoch": 10.56, "grad_norm": 9.157276153564453, "learning_rate": 7.168458781362007e-07, "loss": 0.8298, "step": 330 }, { "epoch": 10.88, "grad_norm": 6.260276794433594, "learning_rate": 7.048984468339306e-07, "loss": 0.3765, "step": 340 }, { "epoch": 10.975999999999999, "eval_accuracy": 0.75, "eval_loss": 0.5513983964920044, "eval_runtime": 1.3718, "eval_samples_per_second": 23.327, "eval_steps_per_second": 23.327, "step": 343 }, { "epoch": 11.2, "grad_norm": 6.0307416915893555, "learning_rate": 6.929510155316607e-07, "loss": 0.409, "step": 350 }, { "epoch": 11.52, "grad_norm": 7.104811191558838, "learning_rate": 6.810035842293906e-07, "loss": 0.5515, "step": 360 }, { "epoch": 11.84, "grad_norm": 1.7931679487228394, "learning_rate": 6.690561529271206e-07, "loss": 0.5727, "step": 370 }, { "epoch": 12.0, "eval_accuracy": 0.75, "eval_loss": 0.5506787300109863, "eval_runtime": 1.1592, "eval_samples_per_second": 27.606, "eval_steps_per_second": 27.606, "step": 375 }, { "epoch": 12.16, "grad_norm": 15.06185245513916, "learning_rate": 6.571087216248506e-07, "loss": 0.6646, "step": 380 }, { "epoch": 12.48, "grad_norm": 4.5719828605651855, "learning_rate": 6.451612903225806e-07, "loss": 0.6321, "step": 390 }, { "epoch": 12.8, "grad_norm": 10.500142097473145, "learning_rate": 6.332138590203107e-07, "loss": 0.5613, "step": 400 }, { "epoch": 12.992, "eval_accuracy": 0.75, "eval_loss": 0.5510138273239136, "eval_runtime": 1.1313, "eval_samples_per_second": 28.287, "eval_steps_per_second": 28.287, "step": 406 }, { "epoch": 13.12, "grad_norm": 3.0991406440734863, "learning_rate": 6.212664277180406e-07, "loss": 0.3966, "step": 410 }, { "epoch": 13.44, "grad_norm": 2.3058762550354004, "learning_rate": 6.093189964157706e-07, "loss": 0.5845, "step": 420 }, { "epoch": 13.76, "grad_norm": 2.215249538421631, "learning_rate": 5.973715651135006e-07, "loss": 0.568, "step": 430 }, { "epoch": 13.984, "eval_accuracy": 0.75, "eval_loss": 0.5510228872299194, "eval_runtime": 1.3316, "eval_samples_per_second": 24.031, "eval_steps_per_second": 24.031, "step": 437 }, { "epoch": 14.08, "grad_norm": 2.978492021560669, "learning_rate": 5.854241338112306e-07, "loss": 0.5611, "step": 440 }, { "epoch": 14.4, "grad_norm": 3.9042763710021973, "learning_rate": 5.734767025089605e-07, "loss": 0.4335, "step": 450 }, { "epoch": 14.72, "grad_norm": 8.8019380569458, "learning_rate": 5.615292712066906e-07, "loss": 0.6655, "step": 460 }, { "epoch": 14.975999999999999, "eval_accuracy": 0.75, "eval_loss": 0.5513969659805298, "eval_runtime": 1.1115, "eval_samples_per_second": 28.791, "eval_steps_per_second": 28.791, "step": 468 }, { "epoch": 15.04, "grad_norm": 3.463810920715332, "learning_rate": 5.495818399044206e-07, "loss": 0.5628, "step": 470 }, { "epoch": 15.36, "grad_norm": 1.9772050380706787, "learning_rate": 5.376344086021505e-07, "loss": 0.504, "step": 480 }, { "epoch": 15.68, "grad_norm": 2.6561172008514404, "learning_rate": 5.256869772998806e-07, "loss": 0.7277, "step": 490 }, { "epoch": 16.0, "grad_norm": 5.404987335205078, "learning_rate": 5.137395459976105e-07, "loss": 0.4883, "step": 500 }, { "epoch": 16.0, "eval_accuracy": 0.75, "eval_loss": 0.5522246956825256, "eval_runtime": 1.157, "eval_samples_per_second": 27.658, "eval_steps_per_second": 27.658, "step": 500 }, { "epoch": 16.32, "grad_norm": 4.906336307525635, "learning_rate": 5.017921146953405e-07, "loss": 0.4576, "step": 510 }, { "epoch": 16.64, "grad_norm": 3.543666124343872, "learning_rate": 4.898446833930704e-07, "loss": 0.687, "step": 520 }, { "epoch": 16.96, "grad_norm": 5.162899017333984, "learning_rate": 4.778972520908004e-07, "loss": 0.5317, "step": 530 }, { "epoch": 16.992, "eval_accuracy": 0.75, "eval_loss": 0.5518386960029602, "eval_runtime": 1.1469, "eval_samples_per_second": 27.901, "eval_steps_per_second": 27.901, "step": 531 }, { "epoch": 17.28, "grad_norm": 3.6676950454711914, "learning_rate": 4.6594982078853044e-07, "loss": 0.5024, "step": 540 }, { "epoch": 17.6, "grad_norm": 10.735907554626465, "learning_rate": 4.540023894862604e-07, "loss": 0.6743, "step": 550 }, { "epoch": 17.92, "grad_norm": 4.575161457061768, "learning_rate": 4.4205495818399044e-07, "loss": 0.4501, "step": 560 }, { "epoch": 17.984, "eval_accuracy": 0.75, "eval_loss": 0.5519962906837463, "eval_runtime": 1.0972, "eval_samples_per_second": 29.165, "eval_steps_per_second": 29.165, "step": 562 }, { "epoch": 18.24, "grad_norm": 6.966436862945557, "learning_rate": 4.3010752688172043e-07, "loss": 0.7276, "step": 570 }, { "epoch": 18.56, "grad_norm": 5.1026763916015625, "learning_rate": 4.1816009557945043e-07, "loss": 0.4801, "step": 580 }, { "epoch": 18.88, "grad_norm": 6.751893043518066, "learning_rate": 4.0621266427718037e-07, "loss": 0.4616, "step": 590 }, { "epoch": 18.976, "eval_accuracy": 0.75, "eval_loss": 0.551902174949646, "eval_runtime": 1.27, "eval_samples_per_second": 25.197, "eval_steps_per_second": 25.197, "step": 593 }, { "epoch": 19.2, "grad_norm": 5.350219249725342, "learning_rate": 3.9426523297491037e-07, "loss": 0.4631, "step": 600 }, { "epoch": 19.52, "grad_norm": 5.136310577392578, "learning_rate": 3.8231780167264037e-07, "loss": 0.5746, "step": 610 }, { "epoch": 19.84, "grad_norm": 4.849793910980225, "learning_rate": 3.703703703703703e-07, "loss": 0.4522, "step": 620 }, { "epoch": 20.0, "eval_accuracy": 0.75, "eval_loss": 0.5509653091430664, "eval_runtime": 1.1115, "eval_samples_per_second": 28.789, "eval_steps_per_second": 28.789, "step": 625 }, { "epoch": 20.16, "grad_norm": 8.096334457397461, "learning_rate": 3.5842293906810036e-07, "loss": 0.66, "step": 630 }, { "epoch": 20.48, "grad_norm": 11.139561653137207, "learning_rate": 3.4647550776583036e-07, "loss": 0.4573, "step": 640 }, { "epoch": 20.8, "grad_norm": 5.0489583015441895, "learning_rate": 3.345280764635603e-07, "loss": 0.6326, "step": 650 }, { "epoch": 20.992, "eval_accuracy": 0.75, "eval_loss": 0.5507452487945557, "eval_runtime": 1.2933, "eval_samples_per_second": 24.742, "eval_steps_per_second": 24.742, "step": 656 }, { "epoch": 21.12, "grad_norm": 12.198716163635254, "learning_rate": 3.225806451612903e-07, "loss": 0.7282, "step": 660 }, { "epoch": 21.44, "grad_norm": 4.501183986663818, "learning_rate": 3.106332138590203e-07, "loss": 0.51, "step": 670 }, { "epoch": 21.76, "grad_norm": 5.399625778198242, "learning_rate": 2.986857825567503e-07, "loss": 0.3828, "step": 680 }, { "epoch": 21.984, "eval_accuracy": 0.75, "eval_loss": 0.5508217811584473, "eval_runtime": 1.1768, "eval_samples_per_second": 27.192, "eval_steps_per_second": 27.192, "step": 687 }, { "epoch": 22.08, "grad_norm": 3.282414436340332, "learning_rate": 2.8673835125448024e-07, "loss": 0.6789, "step": 690 }, { "epoch": 22.4, "grad_norm": 4.816540718078613, "learning_rate": 2.747909199522103e-07, "loss": 0.5746, "step": 700 }, { "epoch": 22.72, "grad_norm": 3.5417306423187256, "learning_rate": 2.628434886499403e-07, "loss": 0.4283, "step": 710 }, { "epoch": 22.976, "eval_accuracy": 0.75, "eval_loss": 0.5509472489356995, "eval_runtime": 1.2114, "eval_samples_per_second": 26.417, "eval_steps_per_second": 26.417, "step": 718 }, { "epoch": 23.04, "grad_norm": 5.404343605041504, "learning_rate": 2.508960573476702e-07, "loss": 0.5891, "step": 720 }, { "epoch": 23.36, "grad_norm": 4.924590587615967, "learning_rate": 2.389486260454002e-07, "loss": 0.4529, "step": 730 }, { "epoch": 23.68, "grad_norm": 3.1310863494873047, "learning_rate": 2.270011947431302e-07, "loss": 0.5812, "step": 740 }, { "epoch": 24.0, "grad_norm": 5.183323383331299, "learning_rate": 2.1505376344086022e-07, "loss": 0.6701, "step": 750 }, { "epoch": 24.0, "eval_accuracy": 0.75, "eval_loss": 0.5505539178848267, "eval_runtime": 1.1047, "eval_samples_per_second": 28.967, "eval_steps_per_second": 28.967, "step": 750 }, { "epoch": 24.32, "grad_norm": 11.777995109558105, "learning_rate": 2.0310633213859019e-07, "loss": 0.5262, "step": 760 }, { "epoch": 24.64, "grad_norm": 2.3787074089050293, "learning_rate": 1.9115890083632018e-07, "loss": 0.4884, "step": 770 }, { "epoch": 24.96, "grad_norm": 5.162797927856445, "learning_rate": 1.7921146953405018e-07, "loss": 0.6157, "step": 780 }, { "epoch": 24.992, "eval_accuracy": 0.75, "eval_loss": 0.5503212213516235, "eval_runtime": 1.2194, "eval_samples_per_second": 26.243, "eval_steps_per_second": 26.243, "step": 781 }, { "epoch": 25.28, "grad_norm": 4.399529933929443, "learning_rate": 1.6726403823178015e-07, "loss": 0.5277, "step": 790 }, { "epoch": 25.6, "grad_norm": 8.869352340698242, "learning_rate": 1.5531660692951015e-07, "loss": 0.6222, "step": 800 }, { "epoch": 25.92, "grad_norm": 3.5912718772888184, "learning_rate": 1.4336917562724012e-07, "loss": 0.5657, "step": 810 }, { "epoch": 25.984, "eval_accuracy": 0.75, "eval_loss": 0.5502746105194092, "eval_runtime": 1.0991, "eval_samples_per_second": 29.116, "eval_steps_per_second": 29.116, "step": 812 }, { "epoch": 26.24, "grad_norm": 4.004587173461914, "learning_rate": 1.3142174432497014e-07, "loss": 0.5406, "step": 820 }, { "epoch": 26.56, "grad_norm": 6.145273208618164, "learning_rate": 1.194743130227001e-07, "loss": 0.6507, "step": 830 }, { "epoch": 26.88, "grad_norm": 2.8329687118530273, "learning_rate": 1.0752688172043011e-07, "loss": 0.5127, "step": 840 }, { "epoch": 26.976, "eval_accuracy": 0.75, "eval_loss": 0.5503281354904175, "eval_runtime": 1.277, "eval_samples_per_second": 25.058, "eval_steps_per_second": 25.058, "step": 843 }, { "epoch": 27.2, "grad_norm": 5.049647808074951, "learning_rate": 9.557945041816009e-08, "loss": 0.4542, "step": 850 }, { "epoch": 27.52, "grad_norm": 2.4863924980163574, "learning_rate": 8.363201911589008e-08, "loss": 0.5172, "step": 860 }, { "epoch": 27.84, "grad_norm": 5.455885887145996, "learning_rate": 7.168458781362006e-08, "loss": 0.6178, "step": 870 }, { "epoch": 28.0, "eval_accuracy": 0.75, "eval_loss": 0.5502800941467285, "eval_runtime": 1.1133, "eval_samples_per_second": 28.744, "eval_steps_per_second": 28.744, "step": 875 }, { "epoch": 28.16, "grad_norm": 5.835262298583984, "learning_rate": 5.973715651135006e-08, "loss": 0.4629, "step": 880 }, { "epoch": 28.48, "grad_norm": 5.468852519989014, "learning_rate": 4.7789725209080046e-08, "loss": 0.381, "step": 890 }, { "epoch": 28.8, "grad_norm": 4.890566825866699, "learning_rate": 3.584229390681003e-08, "loss": 0.5679, "step": 900 }, { "epoch": 28.992, "eval_accuracy": 0.75, "eval_loss": 0.5502068996429443, "eval_runtime": 1.1769, "eval_samples_per_second": 27.191, "eval_steps_per_second": 27.191, "step": 906 }, { "epoch": 29.12, "grad_norm": 5.2326884269714355, "learning_rate": 2.3894862604540023e-08, "loss": 0.7681, "step": 910 }, { "epoch": 29.44, "grad_norm": 8.419669151306152, "learning_rate": 1.1947431302270011e-08, "loss": 0.6077, "step": 920 }, { "epoch": 29.76, "grad_norm": 2.1025705337524414, "learning_rate": 0.0, "loss": 0.6102, "step": 930 }, { "epoch": 29.76, "eval_accuracy": 0.75, "eval_loss": 0.5502274632453918, "eval_runtime": 1.6314, "eval_samples_per_second": 19.615, "eval_steps_per_second": 19.615, "step": 930 }, { "epoch": 29.76, "step": 930, "total_flos": 3.8132430847082496e+17, "train_loss": 0.5704158216394404, "train_runtime": 536.3007, "train_samples_per_second": 6.992, "train_steps_per_second": 1.734 } ], "logging_steps": 10, "max_steps": 930, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.8132430847082496e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }