{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9980430528375734, "eval_steps": 500, "global_step": 1149, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02609262883235486, "grad_norm": 2.5580028899176965, "learning_rate": 5e-06, "loss": 0.9679, "step": 10 }, { "epoch": 0.05218525766470972, "grad_norm": 1.2382684301314397, "learning_rate": 5e-06, "loss": 0.8485, "step": 20 }, { "epoch": 0.07827788649706457, "grad_norm": 1.7244888669737717, "learning_rate": 5e-06, "loss": 0.8086, "step": 30 }, { "epoch": 0.10437051532941943, "grad_norm": 0.6684245608414933, "learning_rate": 5e-06, "loss": 0.7921, "step": 40 }, { "epoch": 0.1304631441617743, "grad_norm": 1.2452393350160769, "learning_rate": 5e-06, "loss": 0.7827, "step": 50 }, { "epoch": 0.15655577299412915, "grad_norm": 0.596876900380745, "learning_rate": 5e-06, "loss": 0.7727, "step": 60 }, { "epoch": 0.182648401826484, "grad_norm": 0.6733648796408476, "learning_rate": 5e-06, "loss": 0.7657, "step": 70 }, { "epoch": 0.20874103065883887, "grad_norm": 0.6054360604030639, "learning_rate": 5e-06, "loss": 0.7592, "step": 80 }, { "epoch": 0.23483365949119372, "grad_norm": 0.6078628726535039, "learning_rate": 5e-06, "loss": 0.7575, "step": 90 }, { "epoch": 0.2609262883235486, "grad_norm": 0.667808601099263, "learning_rate": 5e-06, "loss": 0.7535, "step": 100 }, { "epoch": 0.28701891715590344, "grad_norm": 0.5998036644971939, "learning_rate": 5e-06, "loss": 0.7505, "step": 110 }, { "epoch": 0.3131115459882583, "grad_norm": 0.601234098332995, "learning_rate": 5e-06, "loss": 0.7462, "step": 120 }, { "epoch": 0.33920417482061316, "grad_norm": 0.5470502921448889, "learning_rate": 5e-06, "loss": 0.7464, "step": 130 }, { "epoch": 0.365296803652968, "grad_norm": 0.5354881588471849, "learning_rate": 5e-06, "loss": 0.7448, "step": 140 }, { "epoch": 0.3913894324853229, "grad_norm": 0.638179541327912, "learning_rate": 5e-06, "loss": 0.7403, "step": 150 }, { "epoch": 0.41748206131767773, "grad_norm": 0.7539597801432362, "learning_rate": 5e-06, "loss": 0.7415, "step": 160 }, { "epoch": 0.4435746901500326, "grad_norm": 0.6349380651529639, "learning_rate": 5e-06, "loss": 0.7397, "step": 170 }, { "epoch": 0.46966731898238745, "grad_norm": 0.6486113350477142, "learning_rate": 5e-06, "loss": 0.7346, "step": 180 }, { "epoch": 0.4957599478147423, "grad_norm": 0.524423192769518, "learning_rate": 5e-06, "loss": 0.7336, "step": 190 }, { "epoch": 0.5218525766470972, "grad_norm": 0.8675137881298488, "learning_rate": 5e-06, "loss": 0.7357, "step": 200 }, { "epoch": 0.547945205479452, "grad_norm": 0.6181294717672231, "learning_rate": 5e-06, "loss": 0.7324, "step": 210 }, { "epoch": 0.5740378343118069, "grad_norm": 0.5310035054090366, "learning_rate": 5e-06, "loss": 0.7311, "step": 220 }, { "epoch": 0.6001304631441617, "grad_norm": 0.5616827688931022, "learning_rate": 5e-06, "loss": 0.7286, "step": 230 }, { "epoch": 0.6262230919765166, "grad_norm": 0.6811648368494987, "learning_rate": 5e-06, "loss": 0.7291, "step": 240 }, { "epoch": 0.6523157208088715, "grad_norm": 0.7733201970691932, "learning_rate": 5e-06, "loss": 0.7276, "step": 250 }, { "epoch": 0.6784083496412263, "grad_norm": 0.6192873191830528, "learning_rate": 5e-06, "loss": 0.7272, "step": 260 }, { "epoch": 0.7045009784735812, "grad_norm": 0.8068236017156095, "learning_rate": 5e-06, "loss": 0.7249, "step": 270 }, { "epoch": 0.730593607305936, "grad_norm": 0.8150623115239886, "learning_rate": 5e-06, "loss": 0.728, "step": 280 }, { "epoch": 0.7566862361382909, "grad_norm": 0.724911564188439, "learning_rate": 5e-06, "loss": 0.7264, "step": 290 }, { "epoch": 0.7827788649706457, "grad_norm": 0.6359822287899792, "learning_rate": 5e-06, "loss": 0.723, "step": 300 }, { "epoch": 0.8088714938030006, "grad_norm": 0.5527636979800452, "learning_rate": 5e-06, "loss": 0.7232, "step": 310 }, { "epoch": 0.8349641226353555, "grad_norm": 0.499616870483793, "learning_rate": 5e-06, "loss": 0.7222, "step": 320 }, { "epoch": 0.8610567514677103, "grad_norm": 0.5494308637029189, "learning_rate": 5e-06, "loss": 0.7174, "step": 330 }, { "epoch": 0.8871493803000652, "grad_norm": 0.5742968590351128, "learning_rate": 5e-06, "loss": 0.719, "step": 340 }, { "epoch": 0.91324200913242, "grad_norm": 0.48431807220252887, "learning_rate": 5e-06, "loss": 0.7206, "step": 350 }, { "epoch": 0.9393346379647749, "grad_norm": 0.4853602229330187, "learning_rate": 5e-06, "loss": 0.7178, "step": 360 }, { "epoch": 0.9654272667971298, "grad_norm": 0.501484126053991, "learning_rate": 5e-06, "loss": 0.7204, "step": 370 }, { "epoch": 0.9915198956294846, "grad_norm": 0.5012192063059191, "learning_rate": 5e-06, "loss": 0.7179, "step": 380 }, { "epoch": 1.0176125244618395, "grad_norm": 0.9524671132017011, "learning_rate": 5e-06, "loss": 0.7451, "step": 390 }, { "epoch": 1.0437051532941943, "grad_norm": 0.524028424392148, "learning_rate": 5e-06, "loss": 0.6803, "step": 400 }, { "epoch": 1.0697977821265492, "grad_norm": 0.5407200294759114, "learning_rate": 5e-06, "loss": 0.6793, "step": 410 }, { "epoch": 1.095890410958904, "grad_norm": 0.5857287578156233, "learning_rate": 5e-06, "loss": 0.6811, "step": 420 }, { "epoch": 1.121983039791259, "grad_norm": 0.5280070429598902, "learning_rate": 5e-06, "loss": 0.6807, "step": 430 }, { "epoch": 1.1480756686236138, "grad_norm": 0.4857639927609277, "learning_rate": 5e-06, "loss": 0.6776, "step": 440 }, { "epoch": 1.1741682974559686, "grad_norm": 0.5945511698424941, "learning_rate": 5e-06, "loss": 0.68, "step": 450 }, { "epoch": 1.2002609262883235, "grad_norm": 0.589811732149492, "learning_rate": 5e-06, "loss": 0.6831, "step": 460 }, { "epoch": 1.2263535551206783, "grad_norm": 0.5534380098945336, "learning_rate": 5e-06, "loss": 0.6821, "step": 470 }, { "epoch": 1.2524461839530332, "grad_norm": 0.5126525326289294, "learning_rate": 5e-06, "loss": 0.6798, "step": 480 }, { "epoch": 1.278538812785388, "grad_norm": 0.6254986498705398, "learning_rate": 5e-06, "loss": 0.6797, "step": 490 }, { "epoch": 1.304631441617743, "grad_norm": 0.49035273013831415, "learning_rate": 5e-06, "loss": 0.683, "step": 500 }, { "epoch": 1.3307240704500978, "grad_norm": 0.5403945589242636, "learning_rate": 5e-06, "loss": 0.6769, "step": 510 }, { "epoch": 1.3568166992824526, "grad_norm": 0.5501681060254049, "learning_rate": 5e-06, "loss": 0.6807, "step": 520 }, { "epoch": 1.3829093281148075, "grad_norm": 0.6772693119372506, "learning_rate": 5e-06, "loss": 0.6806, "step": 530 }, { "epoch": 1.4090019569471623, "grad_norm": 0.5937727651699936, "learning_rate": 5e-06, "loss": 0.6797, "step": 540 }, { "epoch": 1.4350945857795172, "grad_norm": 0.5931432683082334, "learning_rate": 5e-06, "loss": 0.6769, "step": 550 }, { "epoch": 1.461187214611872, "grad_norm": 0.6044257552885106, "learning_rate": 5e-06, "loss": 0.6821, "step": 560 }, { "epoch": 1.487279843444227, "grad_norm": 0.5487167986355509, "learning_rate": 5e-06, "loss": 0.6779, "step": 570 }, { "epoch": 1.5133724722765818, "grad_norm": 0.5046084173932228, "learning_rate": 5e-06, "loss": 0.6797, "step": 580 }, { "epoch": 1.5394651011089366, "grad_norm": 0.588333435552722, "learning_rate": 5e-06, "loss": 0.681, "step": 590 }, { "epoch": 1.5655577299412915, "grad_norm": 0.5857191319203284, "learning_rate": 5e-06, "loss": 0.6821, "step": 600 }, { "epoch": 1.5916503587736464, "grad_norm": 0.5197393978903909, "learning_rate": 5e-06, "loss": 0.6815, "step": 610 }, { "epoch": 1.6177429876060012, "grad_norm": 0.5171612878613464, "learning_rate": 5e-06, "loss": 0.6786, "step": 620 }, { "epoch": 1.643835616438356, "grad_norm": 0.566201693304294, "learning_rate": 5e-06, "loss": 0.6793, "step": 630 }, { "epoch": 1.669928245270711, "grad_norm": 0.5044017255239589, "learning_rate": 5e-06, "loss": 0.6776, "step": 640 }, { "epoch": 1.6960208741030658, "grad_norm": 0.478490790469601, "learning_rate": 5e-06, "loss": 0.6754, "step": 650 }, { "epoch": 1.7221135029354206, "grad_norm": 0.4963811798560764, "learning_rate": 5e-06, "loss": 0.6798, "step": 660 }, { "epoch": 1.7482061317677755, "grad_norm": 0.5278784589567511, "learning_rate": 5e-06, "loss": 0.6768, "step": 670 }, { "epoch": 1.7742987606001304, "grad_norm": 0.5387531965847336, "learning_rate": 5e-06, "loss": 0.6795, "step": 680 }, { "epoch": 1.8003913894324852, "grad_norm": 0.5875937914583538, "learning_rate": 5e-06, "loss": 0.6781, "step": 690 }, { "epoch": 1.82648401826484, "grad_norm": 0.5355905144164954, "learning_rate": 5e-06, "loss": 0.6792, "step": 700 }, { "epoch": 1.852576647097195, "grad_norm": 0.6338025629920409, "learning_rate": 5e-06, "loss": 0.6821, "step": 710 }, { "epoch": 1.8786692759295498, "grad_norm": 0.5291801310990664, "learning_rate": 5e-06, "loss": 0.677, "step": 720 }, { "epoch": 1.9047619047619047, "grad_norm": 0.590990170736626, "learning_rate": 5e-06, "loss": 0.6766, "step": 730 }, { "epoch": 1.9308545335942595, "grad_norm": 0.5482515525172131, "learning_rate": 5e-06, "loss": 0.6751, "step": 740 }, { "epoch": 1.9569471624266144, "grad_norm": 0.620356822725272, "learning_rate": 5e-06, "loss": 0.6765, "step": 750 }, { "epoch": 1.9830397912589692, "grad_norm": 0.5642132027456009, "learning_rate": 5e-06, "loss": 0.6776, "step": 760 }, { "epoch": 2.009132420091324, "grad_norm": 0.7543688135393036, "learning_rate": 5e-06, "loss": 0.7153, "step": 770 }, { "epoch": 2.035225048923679, "grad_norm": 0.5581679413180287, "learning_rate": 5e-06, "loss": 0.6374, "step": 780 }, { "epoch": 2.061317677756034, "grad_norm": 0.6393564816609882, "learning_rate": 5e-06, "loss": 0.6364, "step": 790 }, { "epoch": 2.0874103065883887, "grad_norm": 0.614009057667238, "learning_rate": 5e-06, "loss": 0.6402, "step": 800 }, { "epoch": 2.1135029354207435, "grad_norm": 0.5346940357958085, "learning_rate": 5e-06, "loss": 0.6407, "step": 810 }, { "epoch": 2.1395955642530984, "grad_norm": 0.559106756693009, "learning_rate": 5e-06, "loss": 0.638, "step": 820 }, { "epoch": 2.1656881930854532, "grad_norm": 0.603074822031344, "learning_rate": 5e-06, "loss": 0.6363, "step": 830 }, { "epoch": 2.191780821917808, "grad_norm": 0.6669696137807871, "learning_rate": 5e-06, "loss": 0.6394, "step": 840 }, { "epoch": 2.217873450750163, "grad_norm": 0.6196755868842616, "learning_rate": 5e-06, "loss": 0.6407, "step": 850 }, { "epoch": 2.243966079582518, "grad_norm": 0.5462481539172109, "learning_rate": 5e-06, "loss": 0.6407, "step": 860 }, { "epoch": 2.2700587084148727, "grad_norm": 0.5378190251307421, "learning_rate": 5e-06, "loss": 0.6401, "step": 870 }, { "epoch": 2.2961513372472275, "grad_norm": 0.5143159135941981, "learning_rate": 5e-06, "loss": 0.6403, "step": 880 }, { "epoch": 2.3222439660795824, "grad_norm": 0.514592343488391, "learning_rate": 5e-06, "loss": 0.6412, "step": 890 }, { "epoch": 2.3483365949119372, "grad_norm": 0.5516158921785709, "learning_rate": 5e-06, "loss": 0.6434, "step": 900 }, { "epoch": 2.374429223744292, "grad_norm": 0.669009619898686, "learning_rate": 5e-06, "loss": 0.6386, "step": 910 }, { "epoch": 2.400521852576647, "grad_norm": 0.6127707444182544, "learning_rate": 5e-06, "loss": 0.6378, "step": 920 }, { "epoch": 2.426614481409002, "grad_norm": 0.6147599175693709, "learning_rate": 5e-06, "loss": 0.6411, "step": 930 }, { "epoch": 2.4527071102413567, "grad_norm": 0.5183387934748579, "learning_rate": 5e-06, "loss": 0.6431, "step": 940 }, { "epoch": 2.4787997390737115, "grad_norm": 0.5481797536089282, "learning_rate": 5e-06, "loss": 0.6415, "step": 950 }, { "epoch": 2.5048923679060664, "grad_norm": 0.49281844671077113, "learning_rate": 5e-06, "loss": 0.6416, "step": 960 }, { "epoch": 2.5309849967384213, "grad_norm": 0.5590165679106437, "learning_rate": 5e-06, "loss": 0.6413, "step": 970 }, { "epoch": 2.557077625570776, "grad_norm": 0.4928056280476181, "learning_rate": 5e-06, "loss": 0.6398, "step": 980 }, { "epoch": 2.583170254403131, "grad_norm": 0.6836197415170101, "learning_rate": 5e-06, "loss": 0.6458, "step": 990 }, { "epoch": 2.609262883235486, "grad_norm": 0.5754142213185701, "learning_rate": 5e-06, "loss": 0.6391, "step": 1000 }, { "epoch": 2.6353555120678407, "grad_norm": 0.5860168635077211, "learning_rate": 5e-06, "loss": 0.6412, "step": 1010 }, { "epoch": 2.6614481409001955, "grad_norm": 0.7689811537018695, "learning_rate": 5e-06, "loss": 0.6419, "step": 1020 }, { "epoch": 2.6875407697325504, "grad_norm": 0.7522119854978059, "learning_rate": 5e-06, "loss": 0.643, "step": 1030 }, { "epoch": 2.7136333985649053, "grad_norm": 0.5487355635946237, "learning_rate": 5e-06, "loss": 0.6395, "step": 1040 }, { "epoch": 2.73972602739726, "grad_norm": 0.6331210725666454, "learning_rate": 5e-06, "loss": 0.6409, "step": 1050 }, { "epoch": 2.765818656229615, "grad_norm": 0.5745683262495273, "learning_rate": 5e-06, "loss": 0.6447, "step": 1060 }, { "epoch": 2.79191128506197, "grad_norm": 0.5686919776335425, "learning_rate": 5e-06, "loss": 0.6396, "step": 1070 }, { "epoch": 2.8180039138943247, "grad_norm": 0.6015098918868974, "learning_rate": 5e-06, "loss": 0.6414, "step": 1080 }, { "epoch": 2.8440965427266796, "grad_norm": 0.4915249083089861, "learning_rate": 5e-06, "loss": 0.641, "step": 1090 }, { "epoch": 2.8701891715590344, "grad_norm": 0.5923242648103054, "learning_rate": 5e-06, "loss": 0.6418, "step": 1100 }, { "epoch": 2.8962818003913893, "grad_norm": 0.5453451599354968, "learning_rate": 5e-06, "loss": 0.6435, "step": 1110 }, { "epoch": 2.922374429223744, "grad_norm": 0.6120521256139003, "learning_rate": 5e-06, "loss": 0.6448, "step": 1120 }, { "epoch": 2.948467058056099, "grad_norm": 0.5414568129256186, "learning_rate": 5e-06, "loss": 0.6423, "step": 1130 }, { "epoch": 2.974559686888454, "grad_norm": 0.5372353896161117, "learning_rate": 5e-06, "loss": 0.6412, "step": 1140 }, { "epoch": 2.9980430528375734, "step": 1149, "total_flos": 1924199035699200.0, "train_loss": 0.6906216756689538, "train_runtime": 17455.8647, "train_samples_per_second": 33.704, "train_steps_per_second": 0.066 } ], "logging_steps": 10, "max_steps": 1149, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1924199035699200.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }