{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.089058524173028, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05089058524173028, "grad_norm": 2.2103662490844727, "learning_rate": 0.0001999771640780308, "loss": 3.0938, "num_input_tokens_seen": 190176, "step": 5 }, { "epoch": 0.10178117048346055, "grad_norm": 0.8035047054290771, "learning_rate": 0.0001998844110196681, "loss": 1.471, "num_input_tokens_seen": 378656, "step": 10 }, { "epoch": 0.15267175572519084, "grad_norm": 0.44624239206314087, "learning_rate": 0.00019972037971811802, "loss": 1.2452, "num_input_tokens_seen": 570208, "step": 15 }, { "epoch": 0.2035623409669211, "grad_norm": 0.41621240973472595, "learning_rate": 0.00019948518722731206, "loss": 1.1957, "num_input_tokens_seen": 763040, "step": 20 }, { "epoch": 0.2544529262086514, "grad_norm": 0.3875686824321747, "learning_rate": 0.0001991790013823246, "loss": 1.1319, "num_input_tokens_seen": 959200, "step": 25 }, { "epoch": 0.3053435114503817, "grad_norm": 0.4382984936237335, "learning_rate": 0.00019880204067960472, "loss": 1.1209, "num_input_tokens_seen": 1153728, "step": 30 }, { "epoch": 0.356234096692112, "grad_norm": 0.45683977007865906, "learning_rate": 0.00019835457412105528, "loss": 1.123, "num_input_tokens_seen": 1351616, "step": 35 }, { "epoch": 0.4071246819338422, "grad_norm": 0.4628298580646515, "learning_rate": 0.00019783692102207155, "loss": 1.0532, "num_input_tokens_seen": 1537696, "step": 40 }, { "epoch": 0.4580152671755725, "grad_norm": 0.5620412826538086, "learning_rate": 0.00019724945078367513, "loss": 1.0805, "num_input_tokens_seen": 1730144, "step": 45 }, { "epoch": 0.5089058524173028, "grad_norm": 0.4734291732311249, "learning_rate": 0.00019659258262890683, "loss": 1.0572, "num_input_tokens_seen": 1916608, "step": 50 }, { "epoch": 0.5597964376590331, "grad_norm": 0.46405333280563354, "learning_rate": 0.00019586678530366606, "loss": 1.0069, "num_input_tokens_seen": 2103712, "step": 55 }, { "epoch": 0.6106870229007634, "grad_norm": 0.4856356084346771, "learning_rate": 0.00019507257674221027, "loss": 1.0293, "num_input_tokens_seen": 2289888, "step": 60 }, { "epoch": 0.6615776081424937, "grad_norm": 0.4761466085910797, "learning_rate": 0.00019421052369755334, "loss": 0.9994, "num_input_tokens_seen": 2477888, "step": 65 }, { "epoch": 0.712468193384224, "grad_norm": 0.5239170789718628, "learning_rate": 0.0001932812413370265, "loss": 1.0159, "num_input_tokens_seen": 2662848, "step": 70 }, { "epoch": 0.7633587786259542, "grad_norm": 0.4161457419395447, "learning_rate": 0.00019228539280329038, "loss": 1.0233, "num_input_tokens_seen": 2861024, "step": 75 }, { "epoch": 0.8142493638676844, "grad_norm": 0.5579718351364136, "learning_rate": 0.00019122368874111172, "loss": 1.0109, "num_input_tokens_seen": 3043712, "step": 80 }, { "epoch": 0.8651399491094147, "grad_norm": 0.4595694839954376, "learning_rate": 0.0001900968867902419, "loss": 1.0132, "num_input_tokens_seen": 3231840, "step": 85 }, { "epoch": 0.916030534351145, "grad_norm": 0.49271535873413086, "learning_rate": 0.00018890579104475995, "loss": 1.0079, "num_input_tokens_seen": 3429632, "step": 90 }, { "epoch": 0.9669211195928753, "grad_norm": 0.5259862542152405, "learning_rate": 0.00018765125147926476, "loss": 0.9515, "num_input_tokens_seen": 3620000, "step": 95 }, { "epoch": 1.0178117048346056, "grad_norm": 0.444477379322052, "learning_rate": 0.00018633416334232753, "loss": 0.93, "num_input_tokens_seen": 3809824, "step": 100 }, { "epoch": 1.0687022900763359, "grad_norm": 0.42680156230926514, "learning_rate": 0.0001849554665176354, "loss": 0.8925, "num_input_tokens_seen": 4005472, "step": 105 }, { "epoch": 1.1195928753180662, "grad_norm": 0.6088462471961975, "learning_rate": 0.00018351614485328388, "loss": 0.8715, "num_input_tokens_seen": 4192480, "step": 110 }, { "epoch": 1.1704834605597965, "grad_norm": 0.5572230815887451, "learning_rate": 0.0001820172254596956, "loss": 0.8569, "num_input_tokens_seen": 4379296, "step": 115 }, { "epoch": 1.2213740458015268, "grad_norm": 0.5509177446365356, "learning_rate": 0.00018045977797666684, "loss": 0.8269, "num_input_tokens_seen": 4568352, "step": 120 }, { "epoch": 1.272264631043257, "grad_norm": 0.67293781042099, "learning_rate": 0.00017884491381006478, "loss": 0.84, "num_input_tokens_seen": 4752928, "step": 125 }, { "epoch": 1.3231552162849873, "grad_norm": 0.5845524668693542, "learning_rate": 0.00017717378533872017, "loss": 0.8243, "num_input_tokens_seen": 4939008, "step": 130 }, { "epoch": 1.3740458015267176, "grad_norm": 0.5623295307159424, "learning_rate": 0.00017544758509208146, "loss": 0.8624, "num_input_tokens_seen": 5133664, "step": 135 }, { "epoch": 1.424936386768448, "grad_norm": 0.5870885848999023, "learning_rate": 0.00017366754489921694, "loss": 0.8543, "num_input_tokens_seen": 5332000, "step": 140 }, { "epoch": 1.4758269720101782, "grad_norm": 0.5874502062797546, "learning_rate": 0.00017183493500977278, "loss": 0.842, "num_input_tokens_seen": 5520640, "step": 145 }, { "epoch": 1.5267175572519083, "grad_norm": 0.5786451101303101, "learning_rate": 0.0001699510631875134, "loss": 0.8268, "num_input_tokens_seen": 5710528, "step": 150 }, { "epoch": 1.5776081424936388, "grad_norm": 0.5929097533226013, "learning_rate": 0.00016801727377709194, "loss": 0.8238, "num_input_tokens_seen": 5907264, "step": 155 }, { "epoch": 1.6284987277353689, "grad_norm": 0.570965051651001, "learning_rate": 0.00016603494674471593, "loss": 0.8324, "num_input_tokens_seen": 6095776, "step": 160 }, { "epoch": 1.6793893129770994, "grad_norm": 0.6001121401786804, "learning_rate": 0.0001640054966933935, "loss": 0.8431, "num_input_tokens_seen": 6281280, "step": 165 }, { "epoch": 1.7302798982188294, "grad_norm": 0.5865895748138428, "learning_rate": 0.00016193037185346224, "loss": 0.8054, "num_input_tokens_seen": 6470368, "step": 170 }, { "epoch": 1.78117048346056, "grad_norm": 0.6706877946853638, "learning_rate": 0.00015981105304912162, "loss": 0.8255, "num_input_tokens_seen": 6658944, "step": 175 }, { "epoch": 1.83206106870229, "grad_norm": 0.6549966931343079, "learning_rate": 0.0001576490526417059, "loss": 0.8334, "num_input_tokens_seen": 6842880, "step": 180 }, { "epoch": 1.8829516539440203, "grad_norm": 0.6813002228736877, "learning_rate": 0.0001554459134504523, "loss": 0.8354, "num_input_tokens_seen": 7034432, "step": 185 }, { "epoch": 1.9338422391857506, "grad_norm": 0.6717746257781982, "learning_rate": 0.00015320320765153367, "loss": 0.8294, "num_input_tokens_seen": 7228672, "step": 190 }, { "epoch": 1.984732824427481, "grad_norm": 0.6509274840354919, "learning_rate": 0.00015092253565614233, "loss": 0.8561, "num_input_tokens_seen": 7414144, "step": 195 }, { "epoch": 2.035623409669211, "grad_norm": 0.6088241338729858, "learning_rate": 0.00014860552496842494, "loss": 0.6968, "num_input_tokens_seen": 7603104, "step": 200 }, { "epoch": 2.0865139949109412, "grad_norm": 0.7045843601226807, "learning_rate": 0.00014625382902408356, "loss": 0.6587, "num_input_tokens_seen": 7794016, "step": 205 }, { "epoch": 2.1374045801526718, "grad_norm": 0.6975011825561523, "learning_rate": 0.00014386912601047213, "loss": 0.6276, "num_input_tokens_seen": 7987168, "step": 210 }, { "epoch": 2.188295165394402, "grad_norm": 0.7472736835479736, "learning_rate": 0.00014145311766902957, "loss": 0.6451, "num_input_tokens_seen": 8180224, "step": 215 }, { "epoch": 2.2391857506361323, "grad_norm": 0.7247308492660522, "learning_rate": 0.00013900752808090468, "loss": 0.6776, "num_input_tokens_seen": 8371136, "step": 220 }, { "epoch": 2.2900763358778624, "grad_norm": 0.8300191164016724, "learning_rate": 0.00013653410243663952, "loss": 0.6307, "num_input_tokens_seen": 8552064, "step": 225 }, { "epoch": 2.340966921119593, "grad_norm": 0.791684627532959, "learning_rate": 0.00013403460579078833, "loss": 0.6623, "num_input_tokens_seen": 8737120, "step": 230 }, { "epoch": 2.391857506361323, "grad_norm": 0.7664393782615662, "learning_rate": 0.0001315108218023621, "loss": 0.6754, "num_input_tokens_seen": 8928608, "step": 235 }, { "epoch": 2.4427480916030535, "grad_norm": 0.7355414032936096, "learning_rate": 0.0001289645514619963, "loss": 0.6518, "num_input_tokens_seen": 9120032, "step": 240 }, { "epoch": 2.4936386768447836, "grad_norm": 0.7242420315742493, "learning_rate": 0.00012639761180675098, "loss": 0.6331, "num_input_tokens_seen": 9307840, "step": 245 }, { "epoch": 2.544529262086514, "grad_norm": 0.7869409322738647, "learning_rate": 0.00012381183462345982, "loss": 0.6588, "num_input_tokens_seen": 9499104, "step": 250 }, { "epoch": 2.595419847328244, "grad_norm": 0.7066891193389893, "learning_rate": 0.0001212090651415537, "loss": 0.6657, "num_input_tokens_seen": 9694752, "step": 255 }, { "epoch": 2.6463104325699747, "grad_norm": 0.8193861842155457, "learning_rate": 0.00011859116071629149, "loss": 0.6707, "num_input_tokens_seen": 9889824, "step": 260 }, { "epoch": 2.6972010178117047, "grad_norm": 0.7292617559432983, "learning_rate": 0.00011595998950333793, "loss": 0.6624, "num_input_tokens_seen": 10080896, "step": 265 }, { "epoch": 2.7480916030534353, "grad_norm": 0.7750299572944641, "learning_rate": 0.00011331742912563413, "loss": 0.6671, "num_input_tokens_seen": 10278144, "step": 270 }, { "epoch": 2.7989821882951653, "grad_norm": 0.7738965153694153, "learning_rate": 0.00011066536533351202, "loss": 0.6654, "num_input_tokens_seen": 10466528, "step": 275 }, { "epoch": 2.849872773536896, "grad_norm": 0.8403007984161377, "learning_rate": 0.00010800569065900933, "loss": 0.6681, "num_input_tokens_seen": 10649024, "step": 280 }, { "epoch": 2.900763358778626, "grad_norm": 0.8651427626609802, "learning_rate": 0.0001053403030653449, "loss": 0.6718, "num_input_tokens_seen": 10834816, "step": 285 }, { "epoch": 2.9516539440203564, "grad_norm": 0.8150460720062256, "learning_rate": 0.00010267110459251823, "loss": 0.637, "num_input_tokens_seen": 11027616, "step": 290 }, { "epoch": 3.0025445292620865, "grad_norm": 0.7902624011039734, "learning_rate": 0.0001, "loss": 0.6567, "num_input_tokens_seen": 11217664, "step": 295 }, { "epoch": 3.053435114503817, "grad_norm": 0.7837531566619873, "learning_rate": 9.73288954074818e-05, "loss": 0.5179, "num_input_tokens_seen": 11406624, "step": 300 }, { "epoch": 3.104325699745547, "grad_norm": 0.893497109413147, "learning_rate": 9.46596969346551e-05, "loss": 0.5329, "num_input_tokens_seen": 11596096, "step": 305 }, { "epoch": 3.1552162849872776, "grad_norm": 0.8641018867492676, "learning_rate": 9.199430934099068e-05, "loss": 0.5039, "num_input_tokens_seen": 11788992, "step": 310 }, { "epoch": 3.2061068702290076, "grad_norm": 0.9465248584747314, "learning_rate": 8.933463466648798e-05, "loss": 0.5085, "num_input_tokens_seen": 11986976, "step": 315 }, { "epoch": 3.2569974554707377, "grad_norm": 0.8461422920227051, "learning_rate": 8.66825708743659e-05, "loss": 0.5065, "num_input_tokens_seen": 12179136, "step": 320 }, { "epoch": 3.3078880407124682, "grad_norm": 0.9939844608306885, "learning_rate": 8.404001049666211e-05, "loss": 0.5059, "num_input_tokens_seen": 12368608, "step": 325 }, { "epoch": 3.3587786259541983, "grad_norm": 0.8322144150733948, "learning_rate": 8.140883928370855e-05, "loss": 0.5089, "num_input_tokens_seen": 12556256, "step": 330 }, { "epoch": 3.409669211195929, "grad_norm": 0.8849288821220398, "learning_rate": 7.879093485844635e-05, "loss": 0.4853, "num_input_tokens_seen": 12746784, "step": 335 }, { "epoch": 3.460559796437659, "grad_norm": 0.898762047290802, "learning_rate": 7.618816537654018e-05, "loss": 0.5119, "num_input_tokens_seen": 12944096, "step": 340 }, { "epoch": 3.5114503816793894, "grad_norm": 0.9055672883987427, "learning_rate": 7.360238819324903e-05, "loss": 0.5024, "num_input_tokens_seen": 13139904, "step": 345 }, { "epoch": 3.5623409669211195, "grad_norm": 0.8258459568023682, "learning_rate": 7.10354485380037e-05, "loss": 0.5074, "num_input_tokens_seen": 13328480, "step": 350 }, { "epoch": 3.61323155216285, "grad_norm": 0.8779503703117371, "learning_rate": 6.848917819763793e-05, "loss": 0.5078, "num_input_tokens_seen": 13518112, "step": 355 }, { "epoch": 3.66412213740458, "grad_norm": 0.8894705176353455, "learning_rate": 6.596539420921171e-05, "loss": 0.5211, "num_input_tokens_seen": 13705632, "step": 360 }, { "epoch": 3.7150127226463106, "grad_norm": 0.8531593680381775, "learning_rate": 6.34658975633605e-05, "loss": 0.5042, "num_input_tokens_seen": 13896896, "step": 365 }, { "epoch": 3.7659033078880406, "grad_norm": 0.859188973903656, "learning_rate": 6.0992471919095315e-05, "loss": 0.5038, "num_input_tokens_seen": 14081888, "step": 370 }, { "epoch": 3.816793893129771, "grad_norm": 0.9424790740013123, "learning_rate": 5.854688233097045e-05, "loss": 0.5046, "num_input_tokens_seen": 14270560, "step": 375 }, { "epoch": 3.867684478371501, "grad_norm": 0.90933758020401, "learning_rate": 5.613087398952792e-05, "loss": 0.5196, "num_input_tokens_seen": 14461120, "step": 380 }, { "epoch": 3.9185750636132317, "grad_norm": 0.9108320474624634, "learning_rate": 5.37461709759165e-05, "loss": 0.4898, "num_input_tokens_seen": 14648096, "step": 385 }, { "epoch": 3.969465648854962, "grad_norm": 0.8353007435798645, "learning_rate": 5.139447503157513e-05, "loss": 0.5283, "num_input_tokens_seen": 14839840, "step": 390 }, { "epoch": 4.020356234096692, "grad_norm": 0.8553094863891602, "learning_rate": 4.9077464343857694e-05, "loss": 0.4485, "num_input_tokens_seen": 15020448, "step": 395 }, { "epoch": 4.071246819338422, "grad_norm": 0.9495893120765686, "learning_rate": 4.6796792348466356e-05, "loss": 0.4041, "num_input_tokens_seen": 15210368, "step": 400 }, { "epoch": 4.122137404580153, "grad_norm": 0.9808993339538574, "learning_rate": 4.4554086549547715e-05, "loss": 0.4028, "num_input_tokens_seen": 15399296, "step": 405 }, { "epoch": 4.1730279898218825, "grad_norm": 1.0066555738449097, "learning_rate": 4.23509473582941e-05, "loss": 0.4001, "num_input_tokens_seen": 15580416, "step": 410 }, { "epoch": 4.223918575063613, "grad_norm": 0.987274706363678, "learning_rate": 4.0188946950878404e-05, "loss": 0.3725, "num_input_tokens_seen": 15768480, "step": 415 }, { "epoch": 4.2748091603053435, "grad_norm": 0.8921579718589783, "learning_rate": 3.806962814653779e-05, "loss": 0.3941, "num_input_tokens_seen": 15951072, "step": 420 }, { "epoch": 4.325699745547074, "grad_norm": 1.0258628129959106, "learning_rate": 3.5994503306606497e-05, "loss": 0.4274, "num_input_tokens_seen": 16148512, "step": 425 }, { "epoch": 4.376590330788804, "grad_norm": 0.9532390236854553, "learning_rate": 3.3965053255284084e-05, "loss": 0.394, "num_input_tokens_seen": 16341600, "step": 430 }, { "epoch": 4.427480916030534, "grad_norm": 0.9402347803115845, "learning_rate": 3.198272622290804e-05, "loss": 0.4072, "num_input_tokens_seen": 16533792, "step": 435 }, { "epoch": 4.478371501272265, "grad_norm": 0.9012470841407776, "learning_rate": 3.0048936812486615e-05, "loss": 0.4025, "num_input_tokens_seen": 16728128, "step": 440 }, { "epoch": 4.529262086513995, "grad_norm": 1.046890377998352, "learning_rate": 2.8165064990227252e-05, "loss": 0.3966, "num_input_tokens_seen": 16922560, "step": 445 }, { "epoch": 4.580152671755725, "grad_norm": 0.9989413619041443, "learning_rate": 2.6332455100783083e-05, "loss": 0.394, "num_input_tokens_seen": 17106272, "step": 450 }, { "epoch": 4.631043256997455, "grad_norm": 0.9461767077445984, "learning_rate": 2.4552414907918564e-05, "loss": 0.4152, "num_input_tokens_seen": 17298240, "step": 455 }, { "epoch": 4.681933842239186, "grad_norm": 1.0528404712677002, "learning_rate": 2.282621466127982e-05, "loss": 0.408, "num_input_tokens_seen": 17490624, "step": 460 }, { "epoch": 4.732824427480916, "grad_norm": 1.0558110475540161, "learning_rate": 2.1155086189935224e-05, "loss": 0.3924, "num_input_tokens_seen": 17677632, "step": 465 }, { "epoch": 4.783715012722646, "grad_norm": 0.8918471932411194, "learning_rate": 1.9540222023333166e-05, "loss": 0.4161, "num_input_tokens_seen": 17867872, "step": 470 }, { "epoch": 4.8346055979643765, "grad_norm": 0.9665652513504028, "learning_rate": 1.7982774540304403e-05, "loss": 0.3904, "num_input_tokens_seen": 18055552, "step": 475 }, { "epoch": 4.885496183206107, "grad_norm": 0.9551387429237366, "learning_rate": 1.6483855146716152e-05, "loss": 0.4057, "num_input_tokens_seen": 18247104, "step": 480 }, { "epoch": 4.9363867684478375, "grad_norm": 0.9778920412063599, "learning_rate": 1.504453348236461e-05, "loss": 0.3934, "num_input_tokens_seen": 18434752, "step": 485 }, { "epoch": 4.987277353689567, "grad_norm": 0.9678117632865906, "learning_rate": 1.3665836657672493e-05, "loss": 0.4161, "num_input_tokens_seen": 18633824, "step": 490 }, { "epoch": 5.038167938931298, "grad_norm": 0.8953499794006348, "learning_rate": 1.2348748520735221e-05, "loss": 0.3599, "num_input_tokens_seen": 18825024, "step": 495 }, { "epoch": 5.089058524173028, "grad_norm": 0.9666320085525513, "learning_rate": 1.1094208955240081e-05, "loss": 0.356, "num_input_tokens_seen": 19013664, "step": 500 } ], "logging_steps": 5, "max_steps": 588, "num_input_tokens_seen": 19013664, "num_train_epochs": 6, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.585691706551173e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }