|
{ |
|
"best_metric": 0.10325244069099426, |
|
"best_model_checkpoint": "./fine-tuned/checkpoint-12500", |
|
"epoch": 2.195486080618249, |
|
"eval_steps": 100, |
|
"global_step": 12500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008781944322472996, |
|
"grad_norm": 212427.96875, |
|
"learning_rate": 2.9934129632882487e-05, |
|
"loss": 0.5421, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.017563888644945992, |
|
"grad_norm": 15316.291015625, |
|
"learning_rate": 2.9868259265764974e-05, |
|
"loss": 0.1903, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.017563888644945992, |
|
"eval_loss": 0.16095133125782013, |
|
"eval_runtime": 175.5949, |
|
"eval_samples_per_second": 25.399, |
|
"eval_steps_per_second": 3.178, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.026345832967418988, |
|
"grad_norm": 21344.13671875, |
|
"learning_rate": 2.980238889864746e-05, |
|
"loss": 0.1742, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.035127777289891984, |
|
"grad_norm": 26603.357421875, |
|
"learning_rate": 2.973651853152995e-05, |
|
"loss": 0.164, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.035127777289891984, |
|
"eval_loss": 0.14671418070793152, |
|
"eval_runtime": 175.3478, |
|
"eval_samples_per_second": 25.435, |
|
"eval_steps_per_second": 3.182, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.04390972161236498, |
|
"grad_norm": 18468.01953125, |
|
"learning_rate": 2.9670648164412437e-05, |
|
"loss": 0.1697, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.052691665934837977, |
|
"grad_norm": 15799.6875, |
|
"learning_rate": 2.9604777797294924e-05, |
|
"loss": 0.161, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.052691665934837977, |
|
"eval_loss": 0.14008501172065735, |
|
"eval_runtime": 175.2345, |
|
"eval_samples_per_second": 25.452, |
|
"eval_steps_per_second": 3.184, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.06147361025731097, |
|
"grad_norm": 17163.763671875, |
|
"learning_rate": 2.953890743017741e-05, |
|
"loss": 0.1634, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.07025555457978397, |
|
"grad_norm": 17603.025390625, |
|
"learning_rate": 2.94730370630599e-05, |
|
"loss": 0.1543, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.07025555457978397, |
|
"eval_loss": 0.13591521978378296, |
|
"eval_runtime": 175.0506, |
|
"eval_samples_per_second": 25.478, |
|
"eval_steps_per_second": 3.188, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.07903749890225696, |
|
"grad_norm": 12623.9189453125, |
|
"learning_rate": 2.9407166695942387e-05, |
|
"loss": 0.1417, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.08781944322472995, |
|
"grad_norm": 14828.5, |
|
"learning_rate": 2.9341296328824874e-05, |
|
"loss": 0.1403, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.08781944322472995, |
|
"eval_loss": 0.13329531252384186, |
|
"eval_runtime": 175.1721, |
|
"eval_samples_per_second": 25.461, |
|
"eval_steps_per_second": 3.185, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.09660138754720295, |
|
"grad_norm": 16192.8515625, |
|
"learning_rate": 2.927542596170736e-05, |
|
"loss": 0.1444, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.10538333186967595, |
|
"grad_norm": 20510.47265625, |
|
"learning_rate": 2.9209555594589847e-05, |
|
"loss": 0.1466, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.10538333186967595, |
|
"eval_loss": 0.1307835429906845, |
|
"eval_runtime": 175.06, |
|
"eval_samples_per_second": 25.477, |
|
"eval_steps_per_second": 3.187, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.11416527619214895, |
|
"grad_norm": 10555.8408203125, |
|
"learning_rate": 2.9143685227472337e-05, |
|
"loss": 0.1472, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.12294722051462194, |
|
"grad_norm": 12451.990234375, |
|
"learning_rate": 2.907781486035482e-05, |
|
"loss": 0.1415, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.12294722051462194, |
|
"eval_loss": 0.1288571059703827, |
|
"eval_runtime": 175.1799, |
|
"eval_samples_per_second": 25.46, |
|
"eval_steps_per_second": 3.185, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.13172916483709493, |
|
"grad_norm": 11173.96875, |
|
"learning_rate": 2.901194449323731e-05, |
|
"loss": 0.1368, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.14051110915956794, |
|
"grad_norm": 47561.75, |
|
"learning_rate": 2.8946074126119797e-05, |
|
"loss": 0.1399, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.14051110915956794, |
|
"eval_loss": 0.12726937234401703, |
|
"eval_runtime": 175.2229, |
|
"eval_samples_per_second": 25.453, |
|
"eval_steps_per_second": 3.185, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.14929305348204092, |
|
"grad_norm": 11766.6767578125, |
|
"learning_rate": 2.8880203759002283e-05, |
|
"loss": 0.1433, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.15807499780451392, |
|
"grad_norm": 14977.416015625, |
|
"learning_rate": 2.881433339188477e-05, |
|
"loss": 0.1371, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.15807499780451392, |
|
"eval_loss": 0.12529444694519043, |
|
"eval_runtime": 174.8253, |
|
"eval_samples_per_second": 25.511, |
|
"eval_steps_per_second": 3.192, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.1668569421269869, |
|
"grad_norm": 11109.173828125, |
|
"learning_rate": 2.874846302476726e-05, |
|
"loss": 0.1292, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.1756388864494599, |
|
"grad_norm": 9897.7958984375, |
|
"learning_rate": 2.8682592657649747e-05, |
|
"loss": 0.1351, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.1756388864494599, |
|
"eval_loss": 0.12485189735889435, |
|
"eval_runtime": 174.8115, |
|
"eval_samples_per_second": 25.513, |
|
"eval_steps_per_second": 3.192, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.18442083077193291, |
|
"grad_norm": 20060.55859375, |
|
"learning_rate": 2.8616722290532233e-05, |
|
"loss": 0.1303, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.1932027750944059, |
|
"grad_norm": 10244.4052734375, |
|
"learning_rate": 2.855085192341472e-05, |
|
"loss": 0.1413, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.1932027750944059, |
|
"eval_loss": 0.12359971553087234, |
|
"eval_runtime": 175.122, |
|
"eval_samples_per_second": 25.468, |
|
"eval_steps_per_second": 3.186, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.2019847194168789, |
|
"grad_norm": 36993.25, |
|
"learning_rate": 2.848498155629721e-05, |
|
"loss": 0.1275, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.2107666637393519, |
|
"grad_norm": 11102.2646484375, |
|
"learning_rate": 2.8419111189179697e-05, |
|
"loss": 0.1377, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.2107666637393519, |
|
"eval_loss": 0.12276890873908997, |
|
"eval_runtime": 175.1309, |
|
"eval_samples_per_second": 25.467, |
|
"eval_steps_per_second": 3.186, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.21954860806182488, |
|
"grad_norm": 10398.369140625, |
|
"learning_rate": 2.835324082206218e-05, |
|
"loss": 0.1356, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.2283305523842979, |
|
"grad_norm": 14664.177734375, |
|
"learning_rate": 2.828737045494467e-05, |
|
"loss": 0.1309, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.2283305523842979, |
|
"eval_loss": 0.1219501867890358, |
|
"eval_runtime": 174.8703, |
|
"eval_samples_per_second": 25.505, |
|
"eval_steps_per_second": 3.191, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.23711249670677087, |
|
"grad_norm": 9694.1875, |
|
"learning_rate": 2.8221500087827156e-05, |
|
"loss": 0.1271, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.24589444102924388, |
|
"grad_norm": 17376.810546875, |
|
"learning_rate": 2.8155629720709643e-05, |
|
"loss": 0.1434, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.24589444102924388, |
|
"eval_loss": 0.12065327912569046, |
|
"eval_runtime": 174.9734, |
|
"eval_samples_per_second": 25.49, |
|
"eval_steps_per_second": 3.189, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.2546763853517169, |
|
"grad_norm": 13443.2255859375, |
|
"learning_rate": 2.808975935359213e-05, |
|
"loss": 0.1383, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.26345832967418986, |
|
"grad_norm": 10927.8994140625, |
|
"learning_rate": 2.802388898647462e-05, |
|
"loss": 0.125, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.26345832967418986, |
|
"eval_loss": 0.11999432742595673, |
|
"eval_runtime": 174.9084, |
|
"eval_samples_per_second": 25.499, |
|
"eval_steps_per_second": 3.19, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.27224027399666284, |
|
"grad_norm": 9734.189453125, |
|
"learning_rate": 2.7958018619357106e-05, |
|
"loss": 0.1315, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.2810222183191359, |
|
"grad_norm": 11625.3203125, |
|
"learning_rate": 2.7892148252239593e-05, |
|
"loss": 0.1376, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.2810222183191359, |
|
"eval_loss": 0.11961102485656738, |
|
"eval_runtime": 175.2567, |
|
"eval_samples_per_second": 25.448, |
|
"eval_steps_per_second": 3.184, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.28980416264160885, |
|
"grad_norm": 10136.365234375, |
|
"learning_rate": 2.782627788512208e-05, |
|
"loss": 0.1245, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.29858610696408183, |
|
"grad_norm": 9877.0, |
|
"learning_rate": 2.776040751800457e-05, |
|
"loss": 0.1366, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.29858610696408183, |
|
"eval_loss": 0.11874815076589584, |
|
"eval_runtime": 175.0761, |
|
"eval_samples_per_second": 25.475, |
|
"eval_steps_per_second": 3.187, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.30736805128655487, |
|
"grad_norm": 13225.7958984375, |
|
"learning_rate": 2.7694537150887056e-05, |
|
"loss": 0.1309, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.31614999560902785, |
|
"grad_norm": 21314.796875, |
|
"learning_rate": 2.762866678376954e-05, |
|
"loss": 0.1257, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.31614999560902785, |
|
"eval_loss": 0.11850052326917648, |
|
"eval_runtime": 174.9576, |
|
"eval_samples_per_second": 25.492, |
|
"eval_steps_per_second": 3.189, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.3249319399315008, |
|
"grad_norm": 11650.2890625, |
|
"learning_rate": 2.756279641665203e-05, |
|
"loss": 0.1339, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.3337138842539738, |
|
"grad_norm": 11948.90625, |
|
"learning_rate": 2.7496926049534516e-05, |
|
"loss": 0.1239, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.3337138842539738, |
|
"eval_loss": 0.1181233748793602, |
|
"eval_runtime": 174.7123, |
|
"eval_samples_per_second": 25.528, |
|
"eval_steps_per_second": 3.194, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.34249582857644684, |
|
"grad_norm": 13416.5234375, |
|
"learning_rate": 2.7431055682417006e-05, |
|
"loss": 0.128, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.3512777728989198, |
|
"grad_norm": 19758.439453125, |
|
"learning_rate": 2.736518531529949e-05, |
|
"loss": 0.1281, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3512777728989198, |
|
"eval_loss": 0.1172366663813591, |
|
"eval_runtime": 175.0951, |
|
"eval_samples_per_second": 25.472, |
|
"eval_steps_per_second": 3.187, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3600597172213928, |
|
"grad_norm": 9282.8935546875, |
|
"learning_rate": 2.729931494818198e-05, |
|
"loss": 0.1247, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.36884166154386583, |
|
"grad_norm": 13347.216796875, |
|
"learning_rate": 2.7233444581064466e-05, |
|
"loss": 0.1256, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.36884166154386583, |
|
"eval_loss": 0.11689984053373337, |
|
"eval_runtime": 175.1847, |
|
"eval_samples_per_second": 25.459, |
|
"eval_steps_per_second": 3.185, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.3776236058663388, |
|
"grad_norm": 9560.6240234375, |
|
"learning_rate": 2.7167574213946952e-05, |
|
"loss": 0.1345, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.3864055501888118, |
|
"grad_norm": 11137.072265625, |
|
"learning_rate": 2.710170384682944e-05, |
|
"loss": 0.1173, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.3864055501888118, |
|
"eval_loss": 0.11689111590385437, |
|
"eval_runtime": 174.8217, |
|
"eval_samples_per_second": 25.512, |
|
"eval_steps_per_second": 3.192, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.3951874945112848, |
|
"grad_norm": 8313.1650390625, |
|
"learning_rate": 2.703583347971193e-05, |
|
"loss": 0.1214, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.4039694388337578, |
|
"grad_norm": 14535.1669921875, |
|
"learning_rate": 2.6969963112594416e-05, |
|
"loss": 0.1291, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.4039694388337578, |
|
"eval_loss": 0.11614538729190826, |
|
"eval_runtime": 174.9499, |
|
"eval_samples_per_second": 25.493, |
|
"eval_steps_per_second": 3.189, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.4127513831562308, |
|
"grad_norm": 14502.2197265625, |
|
"learning_rate": 2.6904092745476902e-05, |
|
"loss": 0.1223, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.4215333274787038, |
|
"grad_norm": 13744.8466796875, |
|
"learning_rate": 2.683822237835939e-05, |
|
"loss": 0.1277, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.4215333274787038, |
|
"eval_loss": 0.11536002904176712, |
|
"eval_runtime": 175.288, |
|
"eval_samples_per_second": 25.444, |
|
"eval_steps_per_second": 3.183, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.4303152718011768, |
|
"grad_norm": 10811.90625, |
|
"learning_rate": 2.6772352011241876e-05, |
|
"loss": 0.1234, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.43909721612364977, |
|
"grad_norm": 10062.30859375, |
|
"learning_rate": 2.6706481644124366e-05, |
|
"loss": 0.1217, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.43909721612364977, |
|
"eval_loss": 0.11480703204870224, |
|
"eval_runtime": 175.079, |
|
"eval_samples_per_second": 25.474, |
|
"eval_steps_per_second": 3.187, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.44787916044612275, |
|
"grad_norm": 9090.5810546875, |
|
"learning_rate": 2.664061127700685e-05, |
|
"loss": 0.1183, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.4566611047685958, |
|
"grad_norm": 13992.6572265625, |
|
"learning_rate": 2.657474090988934e-05, |
|
"loss": 0.1204, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.4566611047685958, |
|
"eval_loss": 0.11466159671545029, |
|
"eval_runtime": 175.0561, |
|
"eval_samples_per_second": 25.478, |
|
"eval_steps_per_second": 3.188, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.46544304909106876, |
|
"grad_norm": 10754.3505859375, |
|
"learning_rate": 2.6508870542771825e-05, |
|
"loss": 0.1233, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.47422499341354174, |
|
"grad_norm": 10475.4765625, |
|
"learning_rate": 2.6443000175654315e-05, |
|
"loss": 0.1226, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.47422499341354174, |
|
"eval_loss": 0.11424204707145691, |
|
"eval_runtime": 174.9273, |
|
"eval_samples_per_second": 25.496, |
|
"eval_steps_per_second": 3.19, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.4830069377360148, |
|
"grad_norm": 16770.1015625, |
|
"learning_rate": 2.63771298085368e-05, |
|
"loss": 0.1331, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.49178888205848775, |
|
"grad_norm": 193283.25, |
|
"learning_rate": 2.631125944141929e-05, |
|
"loss": 0.1193, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.49178888205848775, |
|
"eval_loss": 0.11391730606555939, |
|
"eval_runtime": 175.0052, |
|
"eval_samples_per_second": 25.485, |
|
"eval_steps_per_second": 3.188, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.5005708263809607, |
|
"grad_norm": 8650.1865234375, |
|
"learning_rate": 2.6245389074301775e-05, |
|
"loss": 0.1307, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.5093527707034338, |
|
"grad_norm": 11343.3427734375, |
|
"learning_rate": 2.6179518707184262e-05, |
|
"loss": 0.1173, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.5093527707034338, |
|
"eval_loss": 0.11382684111595154, |
|
"eval_runtime": 175.1185, |
|
"eval_samples_per_second": 25.468, |
|
"eval_steps_per_second": 3.186, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.5181347150259067, |
|
"grad_norm": 9844.486328125, |
|
"learning_rate": 2.611364834006675e-05, |
|
"loss": 0.1229, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.5269166593483797, |
|
"grad_norm": 8915.7255859375, |
|
"learning_rate": 2.6047777972949235e-05, |
|
"loss": 0.125, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5269166593483797, |
|
"eval_loss": 0.11349175125360489, |
|
"eval_runtime": 175.2134, |
|
"eval_samples_per_second": 25.455, |
|
"eval_steps_per_second": 3.185, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5356986036708528, |
|
"grad_norm": 9072.705078125, |
|
"learning_rate": 2.5981907605831725e-05, |
|
"loss": 0.1249, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.5444805479933257, |
|
"grad_norm": 11936.0400390625, |
|
"learning_rate": 2.591603723871421e-05, |
|
"loss": 0.1205, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.5444805479933257, |
|
"eval_loss": 0.11320458352565765, |
|
"eval_runtime": 174.8633, |
|
"eval_samples_per_second": 25.506, |
|
"eval_steps_per_second": 3.191, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.5532624923157987, |
|
"grad_norm": 11488.6748046875, |
|
"learning_rate": 2.58501668715967e-05, |
|
"loss": 0.1202, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.5620444366382717, |
|
"grad_norm": 12126.583984375, |
|
"learning_rate": 2.5784296504479185e-05, |
|
"loss": 0.12, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.5620444366382717, |
|
"eval_loss": 0.11316446959972382, |
|
"eval_runtime": 174.862, |
|
"eval_samples_per_second": 25.506, |
|
"eval_steps_per_second": 3.191, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.5708263809607447, |
|
"grad_norm": 14138.3876953125, |
|
"learning_rate": 2.5718426137361675e-05, |
|
"loss": 0.1272, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.5796083252832177, |
|
"grad_norm": 6703.96728515625, |
|
"learning_rate": 2.5652555770244158e-05, |
|
"loss": 0.1245, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.5796083252832177, |
|
"eval_loss": 0.11248895525932312, |
|
"eval_runtime": 175.0938, |
|
"eval_samples_per_second": 25.472, |
|
"eval_steps_per_second": 3.187, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.5883902696056907, |
|
"grad_norm": 14042.6962890625, |
|
"learning_rate": 2.5586685403126648e-05, |
|
"loss": 0.1135, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.5971722139281637, |
|
"grad_norm": 11223.4375, |
|
"learning_rate": 2.5520815036009135e-05, |
|
"loss": 0.1192, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.5971722139281637, |
|
"eval_loss": 0.11230960488319397, |
|
"eval_runtime": 175.0029, |
|
"eval_samples_per_second": 25.485, |
|
"eval_steps_per_second": 3.189, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.6059541582506367, |
|
"grad_norm": 10186.056640625, |
|
"learning_rate": 2.5454944668891625e-05, |
|
"loss": 0.1133, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.6147361025731097, |
|
"grad_norm": 11724.2939453125, |
|
"learning_rate": 2.5389074301774108e-05, |
|
"loss": 0.1191, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.6147361025731097, |
|
"eval_loss": 0.11222900450229645, |
|
"eval_runtime": 174.8917, |
|
"eval_samples_per_second": 25.501, |
|
"eval_steps_per_second": 3.191, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.6235180468955827, |
|
"grad_norm": 13531.3447265625, |
|
"learning_rate": 2.5323203934656598e-05, |
|
"loss": 0.1178, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.6322999912180557, |
|
"grad_norm": 10695.3486328125, |
|
"learning_rate": 2.5257333567539085e-05, |
|
"loss": 0.1175, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.6322999912180557, |
|
"eval_loss": 0.11206092685461044, |
|
"eval_runtime": 174.9921, |
|
"eval_samples_per_second": 25.487, |
|
"eval_steps_per_second": 3.189, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.6410819355405287, |
|
"grad_norm": 9945.763671875, |
|
"learning_rate": 2.5191463200421568e-05, |
|
"loss": 0.1122, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.6498638798630016, |
|
"grad_norm": 21472.962890625, |
|
"learning_rate": 2.5125592833304058e-05, |
|
"loss": 0.1179, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.6498638798630016, |
|
"eval_loss": 0.11144611984491348, |
|
"eval_runtime": 174.9363, |
|
"eval_samples_per_second": 25.495, |
|
"eval_steps_per_second": 3.19, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.6586458241854747, |
|
"grad_norm": 10160.6298828125, |
|
"learning_rate": 2.5059722466186545e-05, |
|
"loss": 0.1148, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.6674277685079476, |
|
"grad_norm": 13687.66015625, |
|
"learning_rate": 2.4993852099069035e-05, |
|
"loss": 0.1166, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.6674277685079476, |
|
"eval_loss": 0.1111949160695076, |
|
"eval_runtime": 174.9122, |
|
"eval_samples_per_second": 25.499, |
|
"eval_steps_per_second": 3.19, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.6762097128304206, |
|
"grad_norm": 9961.3818359375, |
|
"learning_rate": 2.4927981731951518e-05, |
|
"loss": 0.12, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.6849916571528937, |
|
"grad_norm": 11952.0546875, |
|
"learning_rate": 2.4862111364834008e-05, |
|
"loss": 0.1227, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.6849916571528937, |
|
"eval_loss": 0.11115138977766037, |
|
"eval_runtime": 175.0741, |
|
"eval_samples_per_second": 25.475, |
|
"eval_steps_per_second": 3.187, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.6937736014753666, |
|
"grad_norm": 7658.37255859375, |
|
"learning_rate": 2.4796240997716494e-05, |
|
"loss": 0.1178, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.7025555457978396, |
|
"grad_norm": 9078.0087890625, |
|
"learning_rate": 2.4730370630598984e-05, |
|
"loss": 0.1079, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.7025555457978396, |
|
"eval_loss": 0.11094118654727936, |
|
"eval_runtime": 174.8603, |
|
"eval_samples_per_second": 25.506, |
|
"eval_steps_per_second": 3.191, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.7113374901203127, |
|
"grad_norm": 10273.943359375, |
|
"learning_rate": 2.4664500263481468e-05, |
|
"loss": 0.1122, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.7201194344427856, |
|
"grad_norm": 9615.3408203125, |
|
"learning_rate": 2.4598629896363958e-05, |
|
"loss": 0.1178, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.7201194344427856, |
|
"eval_loss": 0.11066293716430664, |
|
"eval_runtime": 176.4782, |
|
"eval_samples_per_second": 25.272, |
|
"eval_steps_per_second": 3.162, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.7289013787652586, |
|
"grad_norm": 9801.9638671875, |
|
"learning_rate": 2.4532759529246444e-05, |
|
"loss": 0.1235, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.7376833230877317, |
|
"grad_norm": 14902.2216796875, |
|
"learning_rate": 2.446688916212893e-05, |
|
"loss": 0.1178, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.7376833230877317, |
|
"eval_loss": 0.1100853979587555, |
|
"eval_runtime": 176.5072, |
|
"eval_samples_per_second": 25.268, |
|
"eval_steps_per_second": 3.161, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.7464652674102046, |
|
"grad_norm": 11338.7451171875, |
|
"learning_rate": 2.4401018795011417e-05, |
|
"loss": 0.1139, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.7552472117326776, |
|
"grad_norm": 27772.08203125, |
|
"learning_rate": 2.4335148427893904e-05, |
|
"loss": 0.1165, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.7552472117326776, |
|
"eval_loss": 0.11026333272457123, |
|
"eval_runtime": 176.5703, |
|
"eval_samples_per_second": 25.259, |
|
"eval_steps_per_second": 3.16, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.7640291560551506, |
|
"grad_norm": 15169.2919921875, |
|
"learning_rate": 2.4269278060776394e-05, |
|
"loss": 0.1223, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.7728111003776236, |
|
"grad_norm": 9459.4482421875, |
|
"learning_rate": 2.4203407693658877e-05, |
|
"loss": 0.1139, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.7728111003776236, |
|
"eval_loss": 0.11010286957025528, |
|
"eval_runtime": 176.8628, |
|
"eval_samples_per_second": 25.217, |
|
"eval_steps_per_second": 3.155, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.7815930447000966, |
|
"grad_norm": 10122.501953125, |
|
"learning_rate": 2.4137537326541367e-05, |
|
"loss": 0.1143, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.7903749890225696, |
|
"grad_norm": 8584.2705078125, |
|
"learning_rate": 2.4071666959423854e-05, |
|
"loss": 0.1104, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.7903749890225696, |
|
"eval_loss": 0.11007058620452881, |
|
"eval_runtime": 176.6787, |
|
"eval_samples_per_second": 25.244, |
|
"eval_steps_per_second": 3.158, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.7991569333450426, |
|
"grad_norm": 18708.171875, |
|
"learning_rate": 2.4005796592306344e-05, |
|
"loss": 0.1165, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.8079388776675156, |
|
"grad_norm": 7859.576171875, |
|
"learning_rate": 2.3939926225188827e-05, |
|
"loss": 0.1147, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.8079388776675156, |
|
"eval_loss": 0.10995937138795853, |
|
"eval_runtime": 176.8489, |
|
"eval_samples_per_second": 25.219, |
|
"eval_steps_per_second": 3.155, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.8167208219899886, |
|
"grad_norm": 27483.9140625, |
|
"learning_rate": 2.3874055858071317e-05, |
|
"loss": 0.1224, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.8255027663124616, |
|
"grad_norm": 8125.94580078125, |
|
"learning_rate": 2.3808185490953804e-05, |
|
"loss": 0.1112, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.8255027663124616, |
|
"eval_loss": 0.10972581803798676, |
|
"eval_runtime": 176.1825, |
|
"eval_samples_per_second": 25.315, |
|
"eval_steps_per_second": 3.167, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.8342847106349346, |
|
"grad_norm": 20998.330078125, |
|
"learning_rate": 2.3742315123836294e-05, |
|
"loss": 0.1213, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.8430666549574076, |
|
"grad_norm": 7832.6513671875, |
|
"learning_rate": 2.3676444756718777e-05, |
|
"loss": 0.1163, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.8430666549574076, |
|
"eval_loss": 0.10929498076438904, |
|
"eval_runtime": 175.008, |
|
"eval_samples_per_second": 25.485, |
|
"eval_steps_per_second": 3.188, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.8518485992798805, |
|
"grad_norm": 10396.4267578125, |
|
"learning_rate": 2.3610574389601264e-05, |
|
"loss": 0.1056, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.8606305436023536, |
|
"grad_norm": 10345.142578125, |
|
"learning_rate": 2.3544704022483754e-05, |
|
"loss": 0.1131, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.8606305436023536, |
|
"eval_loss": 0.10942210257053375, |
|
"eval_runtime": 175.0447, |
|
"eval_samples_per_second": 25.479, |
|
"eval_steps_per_second": 3.188, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.8694124879248265, |
|
"grad_norm": 7617.70703125, |
|
"learning_rate": 2.3478833655366237e-05, |
|
"loss": 0.1033, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.8781944322472995, |
|
"grad_norm": 8898.314453125, |
|
"learning_rate": 2.3412963288248727e-05, |
|
"loss": 0.1168, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.8781944322472995, |
|
"eval_loss": 0.10880845785140991, |
|
"eval_runtime": 175.2118, |
|
"eval_samples_per_second": 25.455, |
|
"eval_steps_per_second": 3.185, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.8869763765697726, |
|
"grad_norm": 12492.4248046875, |
|
"learning_rate": 2.3347092921131214e-05, |
|
"loss": 0.1151, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.8957583208922455, |
|
"grad_norm": 10120.4833984375, |
|
"learning_rate": 2.3281222554013704e-05, |
|
"loss": 0.1111, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.8957583208922455, |
|
"eval_loss": 0.10875380039215088, |
|
"eval_runtime": 175.9563, |
|
"eval_samples_per_second": 25.347, |
|
"eval_steps_per_second": 3.171, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.9045402652147185, |
|
"grad_norm": 6685.44287109375, |
|
"learning_rate": 2.3215352186896187e-05, |
|
"loss": 0.1114, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.9133222095371916, |
|
"grad_norm": 7174.9296875, |
|
"learning_rate": 2.3149481819778677e-05, |
|
"loss": 0.1212, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.9133222095371916, |
|
"eval_loss": 0.10833785682916641, |
|
"eval_runtime": 176.2427, |
|
"eval_samples_per_second": 25.306, |
|
"eval_steps_per_second": 3.166, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.9221041538596645, |
|
"grad_norm": 12618.1103515625, |
|
"learning_rate": 2.3083611452661163e-05, |
|
"loss": 0.1043, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.9308860981821375, |
|
"grad_norm": 12976.857421875, |
|
"learning_rate": 2.3017741085543653e-05, |
|
"loss": 0.1183, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.9308860981821375, |
|
"eval_loss": 0.10824151337146759, |
|
"eval_runtime": 176.4342, |
|
"eval_samples_per_second": 25.279, |
|
"eval_steps_per_second": 3.163, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.9396680425046106, |
|
"grad_norm": 14113.46875, |
|
"learning_rate": 2.2951870718426137e-05, |
|
"loss": 0.1166, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.9484499868270835, |
|
"grad_norm": 14832.5478515625, |
|
"learning_rate": 2.2886000351308623e-05, |
|
"loss": 0.122, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.9484499868270835, |
|
"eval_loss": 0.10793043673038483, |
|
"eval_runtime": 175.8031, |
|
"eval_samples_per_second": 25.369, |
|
"eval_steps_per_second": 3.174, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.9572319311495565, |
|
"grad_norm": 11024.25390625, |
|
"learning_rate": 2.2820129984191113e-05, |
|
"loss": 0.1156, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.9660138754720295, |
|
"grad_norm": 13093.2275390625, |
|
"learning_rate": 2.27542596170736e-05, |
|
"loss": 0.1154, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.9660138754720295, |
|
"eval_loss": 0.10771273821592331, |
|
"eval_runtime": 176.0575, |
|
"eval_samples_per_second": 25.333, |
|
"eval_steps_per_second": 3.169, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.9747958197945025, |
|
"grad_norm": 18551.283203125, |
|
"learning_rate": 2.2688389249956087e-05, |
|
"loss": 0.1112, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.9835777641169755, |
|
"grad_norm": 16615.962890625, |
|
"learning_rate": 2.2622518882838573e-05, |
|
"loss": 0.1103, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.9835777641169755, |
|
"eval_loss": 0.10789492726325989, |
|
"eval_runtime": 176.2712, |
|
"eval_samples_per_second": 25.302, |
|
"eval_steps_per_second": 3.166, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.9923597084394485, |
|
"grad_norm": 10686.6455078125, |
|
"learning_rate": 2.2556648515721063e-05, |
|
"loss": 0.1134, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 1.0011416527619215, |
|
"grad_norm": 8200.20703125, |
|
"learning_rate": 2.2490778148603546e-05, |
|
"loss": 0.1085, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.0011416527619215, |
|
"eval_loss": 0.10795657336711884, |
|
"eval_runtime": 175.8416, |
|
"eval_samples_per_second": 25.364, |
|
"eval_steps_per_second": 3.173, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.0099235970843945, |
|
"grad_norm": 12689.986328125, |
|
"learning_rate": 2.2424907781486036e-05, |
|
"loss": 0.1051, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 1.0187055414068675, |
|
"grad_norm": 14943.79296875, |
|
"learning_rate": 2.2359037414368523e-05, |
|
"loss": 0.1086, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.0187055414068675, |
|
"eval_loss": 0.10745207220315933, |
|
"eval_runtime": 176.5973, |
|
"eval_samples_per_second": 25.255, |
|
"eval_steps_per_second": 3.16, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.0274874857293406, |
|
"grad_norm": 9522.888671875, |
|
"learning_rate": 2.2293167047251013e-05, |
|
"loss": 0.1163, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 1.0362694300518134, |
|
"grad_norm": 10599.6796875, |
|
"learning_rate": 2.2227296680133496e-05, |
|
"loss": 0.1048, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.0362694300518134, |
|
"eval_loss": 0.10774970054626465, |
|
"eval_runtime": 175.7784, |
|
"eval_samples_per_second": 25.373, |
|
"eval_steps_per_second": 3.174, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.0450513743742864, |
|
"grad_norm": 11812.033203125, |
|
"learning_rate": 2.2161426313015986e-05, |
|
"loss": 0.1103, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 1.0538333186967594, |
|
"grad_norm": 14652.2265625, |
|
"learning_rate": 2.2095555945898473e-05, |
|
"loss": 0.1059, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.0538333186967594, |
|
"eval_loss": 0.10774527490139008, |
|
"eval_runtime": 175.7637, |
|
"eval_samples_per_second": 25.375, |
|
"eval_steps_per_second": 3.175, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.0626152630192325, |
|
"grad_norm": 12538.1015625, |
|
"learning_rate": 2.202968557878096e-05, |
|
"loss": 0.102, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 1.0713972073417055, |
|
"grad_norm": 12819.8525390625, |
|
"learning_rate": 2.1963815211663446e-05, |
|
"loss": 0.1066, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.0713972073417055, |
|
"eval_loss": 0.1075778678059578, |
|
"eval_runtime": 176.0587, |
|
"eval_samples_per_second": 25.332, |
|
"eval_steps_per_second": 3.169, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.0801791516641785, |
|
"grad_norm": 12523.4482421875, |
|
"learning_rate": 2.1897944844545933e-05, |
|
"loss": 0.1052, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 1.0889610959866514, |
|
"grad_norm": 12486.875, |
|
"learning_rate": 2.1832074477428423e-05, |
|
"loss": 0.1186, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.0889610959866514, |
|
"eval_loss": 0.10704567283391953, |
|
"eval_runtime": 176.4421, |
|
"eval_samples_per_second": 25.277, |
|
"eval_steps_per_second": 3.163, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.0977430403091244, |
|
"grad_norm": 9313.5810546875, |
|
"learning_rate": 2.176620411031091e-05, |
|
"loss": 0.1096, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 1.1065249846315974, |
|
"grad_norm": 11183.017578125, |
|
"learning_rate": 2.1700333743193396e-05, |
|
"loss": 0.0989, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.1065249846315974, |
|
"eval_loss": 0.10751615464687347, |
|
"eval_runtime": 176.3617, |
|
"eval_samples_per_second": 25.289, |
|
"eval_steps_per_second": 3.164, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.1153069289540705, |
|
"grad_norm": 6714.72705078125, |
|
"learning_rate": 2.1634463376075883e-05, |
|
"loss": 0.1104, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 1.1240888732765435, |
|
"grad_norm": 10059.513671875, |
|
"learning_rate": 2.1568593008958373e-05, |
|
"loss": 0.1057, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.1240888732765435, |
|
"eval_loss": 0.10703834146261215, |
|
"eval_runtime": 176.3406, |
|
"eval_samples_per_second": 25.292, |
|
"eval_steps_per_second": 3.164, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.1328708175990165, |
|
"grad_norm": 11930.1201171875, |
|
"learning_rate": 2.1502722641840856e-05, |
|
"loss": 0.1128, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 1.1416527619214893, |
|
"grad_norm": 18080.43359375, |
|
"learning_rate": 2.1436852274723346e-05, |
|
"loss": 0.1043, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.1416527619214893, |
|
"eval_loss": 0.10702774673700333, |
|
"eval_runtime": 176.6705, |
|
"eval_samples_per_second": 25.245, |
|
"eval_steps_per_second": 3.158, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.1504347062439624, |
|
"grad_norm": 9161.2529296875, |
|
"learning_rate": 2.1370981907605832e-05, |
|
"loss": 0.1037, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 1.1592166505664354, |
|
"grad_norm": 9676.08203125, |
|
"learning_rate": 2.130511154048832e-05, |
|
"loss": 0.1113, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.1592166505664354, |
|
"eval_loss": 0.10694678127765656, |
|
"eval_runtime": 176.4583, |
|
"eval_samples_per_second": 25.275, |
|
"eval_steps_per_second": 3.162, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.1679985948889084, |
|
"grad_norm": 10895.8876953125, |
|
"learning_rate": 2.1239241173370806e-05, |
|
"loss": 0.1029, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 1.1767805392113815, |
|
"grad_norm": 10269.076171875, |
|
"learning_rate": 2.1173370806253292e-05, |
|
"loss": 0.1086, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.1767805392113815, |
|
"eval_loss": 0.10676951706409454, |
|
"eval_runtime": 176.5328, |
|
"eval_samples_per_second": 25.264, |
|
"eval_steps_per_second": 3.161, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.1855624835338543, |
|
"grad_norm": 9631.9169921875, |
|
"learning_rate": 2.1107500439135782e-05, |
|
"loss": 0.111, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 1.1943444278563273, |
|
"grad_norm": 8764.7451171875, |
|
"learning_rate": 2.104163007201827e-05, |
|
"loss": 0.1066, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.1943444278563273, |
|
"eval_loss": 0.10679937154054642, |
|
"eval_runtime": 176.7169, |
|
"eval_samples_per_second": 25.238, |
|
"eval_steps_per_second": 3.158, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.2031263721788004, |
|
"grad_norm": 12220.5439453125, |
|
"learning_rate": 2.0975759704900756e-05, |
|
"loss": 0.1106, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 1.2119083165012734, |
|
"grad_norm": 13508.0205078125, |
|
"learning_rate": 2.0909889337783242e-05, |
|
"loss": 0.1115, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 1.2119083165012734, |
|
"eval_loss": 0.10662820190191269, |
|
"eval_runtime": 176.4182, |
|
"eval_samples_per_second": 25.281, |
|
"eval_steps_per_second": 3.163, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 1.2206902608237464, |
|
"grad_norm": 11431.49609375, |
|
"learning_rate": 2.0844018970665732e-05, |
|
"loss": 0.1042, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 1.2294722051462195, |
|
"grad_norm": 8914.7119140625, |
|
"learning_rate": 2.077814860354822e-05, |
|
"loss": 0.1037, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.2294722051462195, |
|
"eval_loss": 0.10637149214744568, |
|
"eval_runtime": 176.33, |
|
"eval_samples_per_second": 25.293, |
|
"eval_steps_per_second": 3.165, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.2382541494686923, |
|
"grad_norm": 32394.365234375, |
|
"learning_rate": 2.0712278236430705e-05, |
|
"loss": 0.1082, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 1.2470360937911653, |
|
"grad_norm": 13041.302734375, |
|
"learning_rate": 2.0646407869313192e-05, |
|
"loss": 0.1118, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 1.2470360937911653, |
|
"eval_loss": 0.10657413303852081, |
|
"eval_runtime": 176.3827, |
|
"eval_samples_per_second": 25.286, |
|
"eval_steps_per_second": 3.164, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 1.2558180381136383, |
|
"grad_norm": 9132.3310546875, |
|
"learning_rate": 2.0580537502195682e-05, |
|
"loss": 0.103, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 1.2645999824361114, |
|
"grad_norm": 18166.306640625, |
|
"learning_rate": 2.0514667135078165e-05, |
|
"loss": 0.1108, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.2645999824361114, |
|
"eval_loss": 0.10623560100793839, |
|
"eval_runtime": 176.2458, |
|
"eval_samples_per_second": 25.306, |
|
"eval_steps_per_second": 3.166, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.2733819267585844, |
|
"grad_norm": 12996.111328125, |
|
"learning_rate": 2.0448796767960652e-05, |
|
"loss": 0.1019, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 1.2821638710810572, |
|
"grad_norm": 9010.8212890625, |
|
"learning_rate": 2.0382926400843142e-05, |
|
"loss": 0.1074, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 1.2821638710810572, |
|
"eval_loss": 0.10622620582580566, |
|
"eval_runtime": 176.3743, |
|
"eval_samples_per_second": 25.287, |
|
"eval_steps_per_second": 3.164, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 1.2909458154035303, |
|
"grad_norm": 14462.72265625, |
|
"learning_rate": 2.031705603372563e-05, |
|
"loss": 0.1104, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 1.2997277597260033, |
|
"grad_norm": 9121.498046875, |
|
"learning_rate": 2.0251185666608115e-05, |
|
"loss": 0.1141, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 1.2997277597260033, |
|
"eval_loss": 0.10618162155151367, |
|
"eval_runtime": 176.2451, |
|
"eval_samples_per_second": 25.306, |
|
"eval_steps_per_second": 3.166, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 1.3085097040484763, |
|
"grad_norm": 13231.1484375, |
|
"learning_rate": 2.0185315299490602e-05, |
|
"loss": 0.1106, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 1.3172916483709494, |
|
"grad_norm": 8816.541015625, |
|
"learning_rate": 2.0119444932373092e-05, |
|
"loss": 0.1065, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.3172916483709494, |
|
"eval_loss": 0.1062735840678215, |
|
"eval_runtime": 176.3513, |
|
"eval_samples_per_second": 25.29, |
|
"eval_steps_per_second": 3.164, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.3260735926934224, |
|
"grad_norm": 9924.986328125, |
|
"learning_rate": 2.005357456525558e-05, |
|
"loss": 0.1018, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 1.3348555370158954, |
|
"grad_norm": 14466.806640625, |
|
"learning_rate": 1.9987704198138065e-05, |
|
"loss": 0.1065, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.3348555370158954, |
|
"eval_loss": 0.105972521007061, |
|
"eval_runtime": 176.3022, |
|
"eval_samples_per_second": 25.297, |
|
"eval_steps_per_second": 3.165, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.3436374813383682, |
|
"grad_norm": 13860.5234375, |
|
"learning_rate": 1.992183383102055e-05, |
|
"loss": 0.1096, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 1.3524194256608413, |
|
"grad_norm": 9354.7333984375, |
|
"learning_rate": 1.985596346390304e-05, |
|
"loss": 0.1014, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 1.3524194256608413, |
|
"eval_loss": 0.10594488680362701, |
|
"eval_runtime": 176.4418, |
|
"eval_samples_per_second": 25.277, |
|
"eval_steps_per_second": 3.163, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 1.3612013699833143, |
|
"grad_norm": 9179.8173828125, |
|
"learning_rate": 1.9790093096785525e-05, |
|
"loss": 0.0998, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 1.3699833143057873, |
|
"grad_norm": 6730.38134765625, |
|
"learning_rate": 1.972422272966801e-05, |
|
"loss": 0.1029, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 1.3699833143057873, |
|
"eval_loss": 0.10599970072507858, |
|
"eval_runtime": 176.3495, |
|
"eval_samples_per_second": 25.291, |
|
"eval_steps_per_second": 3.164, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 1.3787652586282602, |
|
"grad_norm": 9362.3427734375, |
|
"learning_rate": 1.96583523625505e-05, |
|
"loss": 0.1092, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 1.3875472029507332, |
|
"grad_norm": 10184.53125, |
|
"learning_rate": 1.9592481995432988e-05, |
|
"loss": 0.1058, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 1.3875472029507332, |
|
"eval_loss": 0.1059907078742981, |
|
"eval_runtime": 176.2866, |
|
"eval_samples_per_second": 25.3, |
|
"eval_steps_per_second": 3.165, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 1.3963291472732062, |
|
"grad_norm": 11201.806640625, |
|
"learning_rate": 1.9526611628315475e-05, |
|
"loss": 0.1143, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 1.4051110915956793, |
|
"grad_norm": 6871.8662109375, |
|
"learning_rate": 1.946074126119796e-05, |
|
"loss": 0.1009, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.4051110915956793, |
|
"eval_loss": 0.10532288253307343, |
|
"eval_runtime": 176.6701, |
|
"eval_samples_per_second": 25.245, |
|
"eval_steps_per_second": 3.158, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.4138930359181523, |
|
"grad_norm": 9399.228515625, |
|
"learning_rate": 1.939487089408045e-05, |
|
"loss": 0.1049, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 1.4226749802406253, |
|
"grad_norm": 19030.01953125, |
|
"learning_rate": 1.9329000526962938e-05, |
|
"loss": 0.1074, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 1.4226749802406253, |
|
"eval_loss": 0.10543525218963623, |
|
"eval_runtime": 176.5483, |
|
"eval_samples_per_second": 25.262, |
|
"eval_steps_per_second": 3.161, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 1.4314569245630984, |
|
"grad_norm": 11648.6005859375, |
|
"learning_rate": 1.9263130159845425e-05, |
|
"loss": 0.0996, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 1.4402388688855714, |
|
"grad_norm": 7726.56494140625, |
|
"learning_rate": 1.919725979272791e-05, |
|
"loss": 0.104, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 1.4402388688855714, |
|
"eval_loss": 0.10535960644483566, |
|
"eval_runtime": 176.4434, |
|
"eval_samples_per_second": 25.277, |
|
"eval_steps_per_second": 3.162, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 1.4490208132080442, |
|
"grad_norm": 9487.5029296875, |
|
"learning_rate": 1.91313894256104e-05, |
|
"loss": 0.1026, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 1.4578027575305172, |
|
"grad_norm": 8190.107421875, |
|
"learning_rate": 1.9065519058492888e-05, |
|
"loss": 0.1058, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 1.4578027575305172, |
|
"eval_loss": 0.10518208146095276, |
|
"eval_runtime": 175.766, |
|
"eval_samples_per_second": 25.375, |
|
"eval_steps_per_second": 3.175, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 1.4665847018529903, |
|
"grad_norm": 15317.1435546875, |
|
"learning_rate": 1.8999648691375374e-05, |
|
"loss": 0.1107, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 1.4753666461754633, |
|
"grad_norm": 8872.7119140625, |
|
"learning_rate": 1.893377832425786e-05, |
|
"loss": 0.1039, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 1.4753666461754633, |
|
"eval_loss": 0.10524547100067139, |
|
"eval_runtime": 175.2642, |
|
"eval_samples_per_second": 25.447, |
|
"eval_steps_per_second": 3.184, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 1.4841485904979361, |
|
"grad_norm": 12482.90234375, |
|
"learning_rate": 1.8867907957140348e-05, |
|
"loss": 0.1038, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 1.4929305348204092, |
|
"grad_norm": 12325.9970703125, |
|
"learning_rate": 1.8802037590022834e-05, |
|
"loss": 0.1113, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.4929305348204092, |
|
"eval_loss": 0.10507776588201523, |
|
"eval_runtime": 175.5813, |
|
"eval_samples_per_second": 25.401, |
|
"eval_steps_per_second": 3.178, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.5017124791428822, |
|
"grad_norm": 9354.1494140625, |
|
"learning_rate": 1.873616722290532e-05, |
|
"loss": 0.1087, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 1.5104944234653552, |
|
"grad_norm": 171124.34375, |
|
"learning_rate": 1.867029685578781e-05, |
|
"loss": 0.1106, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 1.5104944234653552, |
|
"eval_loss": 0.10511680692434311, |
|
"eval_runtime": 175.6013, |
|
"eval_samples_per_second": 25.398, |
|
"eval_steps_per_second": 3.178, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 1.5192763677878283, |
|
"grad_norm": 10542.4892578125, |
|
"learning_rate": 1.8604426488670297e-05, |
|
"loss": 0.1042, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 1.5280583121103013, |
|
"grad_norm": 8730.197265625, |
|
"learning_rate": 1.8538556121552784e-05, |
|
"loss": 0.1028, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 1.5280583121103013, |
|
"eval_loss": 0.10496073216199875, |
|
"eval_runtime": 175.5259, |
|
"eval_samples_per_second": 25.409, |
|
"eval_steps_per_second": 3.179, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 1.5368402564327743, |
|
"grad_norm": 22947.765625, |
|
"learning_rate": 1.847268575443527e-05, |
|
"loss": 0.1106, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 1.5456222007552474, |
|
"grad_norm": 12794.203125, |
|
"learning_rate": 1.840681538731776e-05, |
|
"loss": 0.1027, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 1.5456222007552474, |
|
"eval_loss": 0.10489310324192047, |
|
"eval_runtime": 175.3848, |
|
"eval_samples_per_second": 25.43, |
|
"eval_steps_per_second": 3.182, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 1.5544041450777202, |
|
"grad_norm": 9543.232421875, |
|
"learning_rate": 1.8340945020200247e-05, |
|
"loss": 0.107, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 1.5631860894001932, |
|
"grad_norm": 7341.599609375, |
|
"learning_rate": 1.8275074653082734e-05, |
|
"loss": 0.0986, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 1.5631860894001932, |
|
"eval_loss": 0.10493362694978714, |
|
"eval_runtime": 175.8527, |
|
"eval_samples_per_second": 25.362, |
|
"eval_steps_per_second": 3.173, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 1.5719680337226662, |
|
"grad_norm": 9900.4501953125, |
|
"learning_rate": 1.820920428596522e-05, |
|
"loss": 0.1101, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 1.580749978045139, |
|
"grad_norm": 9512.732421875, |
|
"learning_rate": 1.8143333918847707e-05, |
|
"loss": 0.1014, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.580749978045139, |
|
"eval_loss": 0.10463293641805649, |
|
"eval_runtime": 175.3499, |
|
"eval_samples_per_second": 25.435, |
|
"eval_steps_per_second": 3.182, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.589531922367612, |
|
"grad_norm": 9556.822265625, |
|
"learning_rate": 1.8077463551730197e-05, |
|
"loss": 0.1015, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 1.5983138666900851, |
|
"grad_norm": 9589.861328125, |
|
"learning_rate": 1.801159318461268e-05, |
|
"loss": 0.1054, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 1.5983138666900851, |
|
"eval_loss": 0.10453452169895172, |
|
"eval_runtime": 175.1783, |
|
"eval_samples_per_second": 25.46, |
|
"eval_steps_per_second": 3.185, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 1.6070958110125582, |
|
"grad_norm": 10552.5419921875, |
|
"learning_rate": 1.794572281749517e-05, |
|
"loss": 0.109, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 1.6158777553350312, |
|
"grad_norm": 14104.455078125, |
|
"learning_rate": 1.7879852450377657e-05, |
|
"loss": 0.1037, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 1.6158777553350312, |
|
"eval_loss": 0.10470784455537796, |
|
"eval_runtime": 175.1306, |
|
"eval_samples_per_second": 25.467, |
|
"eval_steps_per_second": 3.186, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 1.6246596996575042, |
|
"grad_norm": 10874.2314453125, |
|
"learning_rate": 1.7813982083260144e-05, |
|
"loss": 0.1034, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 1.6334416439799773, |
|
"grad_norm": 10729.14453125, |
|
"learning_rate": 1.774811171614263e-05, |
|
"loss": 0.1063, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 1.6334416439799773, |
|
"eval_loss": 0.10444886237382889, |
|
"eval_runtime": 175.0317, |
|
"eval_samples_per_second": 25.481, |
|
"eval_steps_per_second": 3.188, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 1.6422235883024503, |
|
"grad_norm": 13631.76953125, |
|
"learning_rate": 1.768224134902512e-05, |
|
"loss": 0.1051, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 1.6510055326249231, |
|
"grad_norm": 10832.173828125, |
|
"learning_rate": 1.7616370981907607e-05, |
|
"loss": 0.1073, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 1.6510055326249231, |
|
"eval_loss": 0.1044657751917839, |
|
"eval_runtime": 175.9228, |
|
"eval_samples_per_second": 25.352, |
|
"eval_steps_per_second": 3.172, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 1.6597874769473961, |
|
"grad_norm": 12738.5595703125, |
|
"learning_rate": 1.7550500614790094e-05, |
|
"loss": 0.1032, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 1.6685694212698692, |
|
"grad_norm": 11901.1611328125, |
|
"learning_rate": 1.748463024767258e-05, |
|
"loss": 0.1033, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.6685694212698692, |
|
"eval_loss": 0.10441263765096664, |
|
"eval_runtime": 176.1189, |
|
"eval_samples_per_second": 25.324, |
|
"eval_steps_per_second": 3.168, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.677351365592342, |
|
"grad_norm": 11122.455078125, |
|
"learning_rate": 1.741875988055507e-05, |
|
"loss": 0.1077, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 1.686133309914815, |
|
"grad_norm": 11424.095703125, |
|
"learning_rate": 1.7352889513437557e-05, |
|
"loss": 0.1009, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 1.686133309914815, |
|
"eval_loss": 0.10410206019878387, |
|
"eval_runtime": 175.8515, |
|
"eval_samples_per_second": 25.362, |
|
"eval_steps_per_second": 3.173, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 1.694915254237288, |
|
"grad_norm": 12446.83203125, |
|
"learning_rate": 1.728701914632004e-05, |
|
"loss": 0.0972, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 1.703697198559761, |
|
"grad_norm": 6987.2763671875, |
|
"learning_rate": 1.722114877920253e-05, |
|
"loss": 0.1053, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 1.703697198559761, |
|
"eval_loss": 0.10402592271566391, |
|
"eval_runtime": 176.2025, |
|
"eval_samples_per_second": 25.312, |
|
"eval_steps_per_second": 3.167, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 1.7124791428822341, |
|
"grad_norm": 12312.5595703125, |
|
"learning_rate": 1.7155278412085017e-05, |
|
"loss": 0.1049, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 1.7212610872047072, |
|
"grad_norm": 12246.89453125, |
|
"learning_rate": 1.7089408044967507e-05, |
|
"loss": 0.1031, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 1.7212610872047072, |
|
"eval_loss": 0.10409308969974518, |
|
"eval_runtime": 176.2211, |
|
"eval_samples_per_second": 25.309, |
|
"eval_steps_per_second": 3.166, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 1.7300430315271802, |
|
"grad_norm": 15511.779296875, |
|
"learning_rate": 1.702353767784999e-05, |
|
"loss": 0.0989, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 1.7388249758496532, |
|
"grad_norm": 11267.55859375, |
|
"learning_rate": 1.695766731073248e-05, |
|
"loss": 0.1051, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 1.7388249758496532, |
|
"eval_loss": 0.10400799661874771, |
|
"eval_runtime": 175.5444, |
|
"eval_samples_per_second": 25.407, |
|
"eval_steps_per_second": 3.179, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 1.7476069201721263, |
|
"grad_norm": 13126.3349609375, |
|
"learning_rate": 1.6891796943614966e-05, |
|
"loss": 0.1057, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 1.756388864494599, |
|
"grad_norm": 11091.8955078125, |
|
"learning_rate": 1.6825926576497453e-05, |
|
"loss": 0.1087, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.756388864494599, |
|
"eval_loss": 0.10402125865221024, |
|
"eval_runtime": 175.4715, |
|
"eval_samples_per_second": 25.417, |
|
"eval_steps_per_second": 3.18, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.7651708088170721, |
|
"grad_norm": 11781.5146484375, |
|
"learning_rate": 1.676005620937994e-05, |
|
"loss": 0.1053, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 1.7739527531395451, |
|
"grad_norm": 9562.85546875, |
|
"learning_rate": 1.669418584226243e-05, |
|
"loss": 0.102, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 1.7739527531395451, |
|
"eval_loss": 0.10393357276916504, |
|
"eval_runtime": 175.3102, |
|
"eval_samples_per_second": 25.441, |
|
"eval_steps_per_second": 3.183, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 1.782734697462018, |
|
"grad_norm": 11889.0390625, |
|
"learning_rate": 1.6628315475144916e-05, |
|
"loss": 0.1068, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 1.791516641784491, |
|
"grad_norm": 9615.0771484375, |
|
"learning_rate": 1.65624451080274e-05, |
|
"loss": 0.1006, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 1.791516641784491, |
|
"eval_loss": 0.10415999591350555, |
|
"eval_runtime": 175.3578, |
|
"eval_samples_per_second": 25.434, |
|
"eval_steps_per_second": 3.182, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 1.800298586106964, |
|
"grad_norm": 11045.595703125, |
|
"learning_rate": 1.649657474090989e-05, |
|
"loss": 0.1099, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 1.809080530429437, |
|
"grad_norm": 13070.5498046875, |
|
"learning_rate": 1.6430704373792376e-05, |
|
"loss": 0.1113, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 1.809080530429437, |
|
"eval_loss": 0.10374840348958969, |
|
"eval_runtime": 175.4676, |
|
"eval_samples_per_second": 25.418, |
|
"eval_steps_per_second": 3.18, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 1.81786247475191, |
|
"grad_norm": 7781.587890625, |
|
"learning_rate": 1.6364834006674866e-05, |
|
"loss": 0.1021, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 1.8266444190743831, |
|
"grad_norm": 8406.8466796875, |
|
"learning_rate": 1.629896363955735e-05, |
|
"loss": 0.1003, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 1.8266444190743831, |
|
"eval_loss": 0.10402803868055344, |
|
"eval_runtime": 175.179, |
|
"eval_samples_per_second": 25.46, |
|
"eval_steps_per_second": 3.185, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 1.8354263633968562, |
|
"grad_norm": 12187.5849609375, |
|
"learning_rate": 1.623309327243984e-05, |
|
"loss": 0.1022, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 1.8442083077193292, |
|
"grad_norm": 8159.96435546875, |
|
"learning_rate": 1.6167222905322326e-05, |
|
"loss": 0.1066, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.8442083077193292, |
|
"eval_loss": 0.10364160686731339, |
|
"eval_runtime": 175.267, |
|
"eval_samples_per_second": 25.447, |
|
"eval_steps_per_second": 3.184, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.852990252041802, |
|
"grad_norm": 11498.6494140625, |
|
"learning_rate": 1.6101352538204813e-05, |
|
"loss": 0.1037, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 1.861772196364275, |
|
"grad_norm": 11206.7646484375, |
|
"learning_rate": 1.60354821710873e-05, |
|
"loss": 0.1041, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 1.861772196364275, |
|
"eval_loss": 0.10378584265708923, |
|
"eval_runtime": 175.3196, |
|
"eval_samples_per_second": 25.439, |
|
"eval_steps_per_second": 3.183, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 1.870554140686748, |
|
"grad_norm": 13556.3330078125, |
|
"learning_rate": 1.596961180396979e-05, |
|
"loss": 0.0996, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 1.879336085009221, |
|
"grad_norm": 7290.71240234375, |
|
"learning_rate": 1.5903741436852276e-05, |
|
"loss": 0.1069, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 1.879336085009221, |
|
"eval_loss": 0.10361269861459732, |
|
"eval_runtime": 175.3025, |
|
"eval_samples_per_second": 25.442, |
|
"eval_steps_per_second": 3.183, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 1.888118029331694, |
|
"grad_norm": 10433.58203125, |
|
"learning_rate": 1.5837871069734763e-05, |
|
"loss": 0.1039, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 1.896899973654167, |
|
"grad_norm": 7497.98193359375, |
|
"learning_rate": 1.577200070261725e-05, |
|
"loss": 0.0992, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 1.896899973654167, |
|
"eval_loss": 0.10356434434652328, |
|
"eval_runtime": 175.2851, |
|
"eval_samples_per_second": 25.444, |
|
"eval_steps_per_second": 3.183, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 1.90568191797664, |
|
"grad_norm": 7763.9208984375, |
|
"learning_rate": 1.5706130335499736e-05, |
|
"loss": 0.0997, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 1.914463862299113, |
|
"grad_norm": 8243.4501953125, |
|
"learning_rate": 1.5640259968382226e-05, |
|
"loss": 0.1053, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 1.914463862299113, |
|
"eval_loss": 0.10348707437515259, |
|
"eval_runtime": 175.331, |
|
"eval_samples_per_second": 25.438, |
|
"eval_steps_per_second": 3.183, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 1.923245806621586, |
|
"grad_norm": 10855.509765625, |
|
"learning_rate": 1.557438960126471e-05, |
|
"loss": 0.1072, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 1.932027750944059, |
|
"grad_norm": 11178.33203125, |
|
"learning_rate": 1.55085192341472e-05, |
|
"loss": 0.1074, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.932027750944059, |
|
"eval_loss": 0.1035689190030098, |
|
"eval_runtime": 175.1907, |
|
"eval_samples_per_second": 25.458, |
|
"eval_steps_per_second": 3.185, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.9408096952665321, |
|
"grad_norm": 15381.65625, |
|
"learning_rate": 1.5442648867029686e-05, |
|
"loss": 0.104, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 1.9495916395890052, |
|
"grad_norm": 11851.7646484375, |
|
"learning_rate": 1.5376778499912176e-05, |
|
"loss": 0.1021, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 1.9495916395890052, |
|
"eval_loss": 0.1033787652850151, |
|
"eval_runtime": 175.4375, |
|
"eval_samples_per_second": 25.422, |
|
"eval_steps_per_second": 3.181, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 1.958373583911478, |
|
"grad_norm": 10821.587890625, |
|
"learning_rate": 1.531090813279466e-05, |
|
"loss": 0.105, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 1.967155528233951, |
|
"grad_norm": 10268.7001953125, |
|
"learning_rate": 1.5245037765677149e-05, |
|
"loss": 0.0995, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 1.967155528233951, |
|
"eval_loss": 0.1031695231795311, |
|
"eval_runtime": 175.3915, |
|
"eval_samples_per_second": 25.429, |
|
"eval_steps_per_second": 3.181, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 1.975937472556424, |
|
"grad_norm": 15241.69140625, |
|
"learning_rate": 1.5179167398559635e-05, |
|
"loss": 0.1001, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 1.9847194168788969, |
|
"grad_norm": 9984.0908203125, |
|
"learning_rate": 1.5113297031442124e-05, |
|
"loss": 0.1051, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 1.9847194168788969, |
|
"eval_loss": 0.10308495908975601, |
|
"eval_runtime": 175.2586, |
|
"eval_samples_per_second": 25.448, |
|
"eval_steps_per_second": 3.184, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 1.99350136120137, |
|
"grad_norm": 10116.9619140625, |
|
"learning_rate": 1.5047426664324609e-05, |
|
"loss": 0.102, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 2.002283305523843, |
|
"grad_norm": 13065.00390625, |
|
"learning_rate": 1.4981556297207097e-05, |
|
"loss": 0.1029, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 2.002283305523843, |
|
"eval_loss": 0.10323852300643921, |
|
"eval_runtime": 175.2291, |
|
"eval_samples_per_second": 25.452, |
|
"eval_steps_per_second": 3.184, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 2.011065249846316, |
|
"grad_norm": 10087.5390625, |
|
"learning_rate": 1.4915685930089584e-05, |
|
"loss": 0.1009, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 2.019847194168789, |
|
"grad_norm": 9332.802734375, |
|
"learning_rate": 1.4849815562972072e-05, |
|
"loss": 0.093, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 2.019847194168789, |
|
"eval_loss": 0.10356967151165009, |
|
"eval_runtime": 175.1976, |
|
"eval_samples_per_second": 25.457, |
|
"eval_steps_per_second": 3.185, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 2.028629138491262, |
|
"grad_norm": 8534.212890625, |
|
"learning_rate": 1.4783945195854559e-05, |
|
"loss": 0.0969, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 2.037411082813735, |
|
"grad_norm": 10853.4990234375, |
|
"learning_rate": 1.4718074828737047e-05, |
|
"loss": 0.1016, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 2.037411082813735, |
|
"eval_loss": 0.10368319600820541, |
|
"eval_runtime": 175.2089, |
|
"eval_samples_per_second": 25.455, |
|
"eval_steps_per_second": 3.185, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 2.046193027136208, |
|
"grad_norm": 10575.98828125, |
|
"learning_rate": 1.4652204461619534e-05, |
|
"loss": 0.0993, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 2.054974971458681, |
|
"grad_norm": 17123.625, |
|
"learning_rate": 1.458633409450202e-05, |
|
"loss": 0.1011, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 2.054974971458681, |
|
"eval_loss": 0.10356248915195465, |
|
"eval_runtime": 175.2585, |
|
"eval_samples_per_second": 25.448, |
|
"eval_steps_per_second": 3.184, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 2.0637569157811537, |
|
"grad_norm": 13095.1728515625, |
|
"learning_rate": 1.4520463727384507e-05, |
|
"loss": 0.0956, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 2.0725388601036268, |
|
"grad_norm": 11280.3291015625, |
|
"learning_rate": 1.4454593360266995e-05, |
|
"loss": 0.0969, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 2.0725388601036268, |
|
"eval_loss": 0.10367120802402496, |
|
"eval_runtime": 175.3325, |
|
"eval_samples_per_second": 25.437, |
|
"eval_steps_per_second": 3.183, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 2.0813208044261, |
|
"grad_norm": 8861.095703125, |
|
"learning_rate": 1.4388722993149482e-05, |
|
"loss": 0.0971, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 2.090102748748573, |
|
"grad_norm": 15480.5634765625, |
|
"learning_rate": 1.432285262603197e-05, |
|
"loss": 0.0977, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 2.090102748748573, |
|
"eval_loss": 0.1037474200129509, |
|
"eval_runtime": 175.4966, |
|
"eval_samples_per_second": 25.414, |
|
"eval_steps_per_second": 3.18, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 2.098884693071046, |
|
"grad_norm": 10802.611328125, |
|
"learning_rate": 1.4256982258914457e-05, |
|
"loss": 0.0979, |
|
"step": 11950 |
|
}, |
|
{ |
|
"epoch": 2.107666637393519, |
|
"grad_norm": 7810.14111328125, |
|
"learning_rate": 1.4191111891796943e-05, |
|
"loss": 0.0963, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 2.107666637393519, |
|
"eval_loss": 0.10353059321641922, |
|
"eval_runtime": 175.3863, |
|
"eval_samples_per_second": 25.43, |
|
"eval_steps_per_second": 3.182, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 2.116448581715992, |
|
"grad_norm": 10527.5751953125, |
|
"learning_rate": 1.4125241524679432e-05, |
|
"loss": 0.0997, |
|
"step": 12050 |
|
}, |
|
{ |
|
"epoch": 2.125230526038465, |
|
"grad_norm": 12505.5380859375, |
|
"learning_rate": 1.4059371157561918e-05, |
|
"loss": 0.0981, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 2.125230526038465, |
|
"eval_loss": 0.10333307832479477, |
|
"eval_runtime": 175.2906, |
|
"eval_samples_per_second": 25.443, |
|
"eval_steps_per_second": 3.183, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 2.134012470360938, |
|
"grad_norm": 9851.1923828125, |
|
"learning_rate": 1.3993500790444406e-05, |
|
"loss": 0.1006, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 2.142794414683411, |
|
"grad_norm": 9354.9697265625, |
|
"learning_rate": 1.3927630423326893e-05, |
|
"loss": 0.1032, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 2.142794414683411, |
|
"eval_loss": 0.10333764553070068, |
|
"eval_runtime": 175.147, |
|
"eval_samples_per_second": 25.464, |
|
"eval_steps_per_second": 3.186, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 2.151576359005884, |
|
"grad_norm": 7880.865234375, |
|
"learning_rate": 1.3861760056209381e-05, |
|
"loss": 0.0957, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 2.160358303328357, |
|
"grad_norm": 17636.8515625, |
|
"learning_rate": 1.3795889689091866e-05, |
|
"loss": 0.0952, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 2.160358303328357, |
|
"eval_loss": 0.10335990786552429, |
|
"eval_runtime": 175.3211, |
|
"eval_samples_per_second": 25.439, |
|
"eval_steps_per_second": 3.183, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 2.1691402476508297, |
|
"grad_norm": 15586.3701171875, |
|
"learning_rate": 1.3730019321974355e-05, |
|
"loss": 0.098, |
|
"step": 12350 |
|
}, |
|
{ |
|
"epoch": 2.1779221919733027, |
|
"grad_norm": 11448.01953125, |
|
"learning_rate": 1.3664148954856841e-05, |
|
"loss": 0.0942, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 2.1779221919733027, |
|
"eval_loss": 0.10320650041103363, |
|
"eval_runtime": 175.2718, |
|
"eval_samples_per_second": 25.446, |
|
"eval_steps_per_second": 3.184, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 2.1867041362957758, |
|
"grad_norm": 13402.5732421875, |
|
"learning_rate": 1.359827858773933e-05, |
|
"loss": 0.1016, |
|
"step": 12450 |
|
}, |
|
{ |
|
"epoch": 2.195486080618249, |
|
"grad_norm": 7308.1123046875, |
|
"learning_rate": 1.3532408220621816e-05, |
|
"loss": 0.0978, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 2.195486080618249, |
|
"eval_loss": 0.10325244069099426, |
|
"eval_runtime": 175.4131, |
|
"eval_samples_per_second": 25.426, |
|
"eval_steps_per_second": 3.181, |
|
"step": 12500 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 22772, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.089152635076608e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|