|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 3375, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005925925925925926, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 2.9629629629629632e-08, |
|
"loss": 1.9405, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.011851851851851851, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 5.9259259259259263e-08, |
|
"loss": 1.9345, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.017777777777777778, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 8.88888888888889e-08, |
|
"loss": 2.0371, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.023703703703703703, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 1.1851851851851853e-07, |
|
"loss": 1.9599, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.02962962962962963, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 1.4814814814814817e-07, |
|
"loss": 1.9826, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.035555555555555556, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 1.777777777777778e-07, |
|
"loss": 1.9455, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.04148148148148148, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 2.074074074074074e-07, |
|
"loss": 2.0185, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.047407407407407405, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 2.3703703703703705e-07, |
|
"loss": 1.8877, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.05333333333333334, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 2.666666666666667e-07, |
|
"loss": 2.0742, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.05925925925925926, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 2.9629629629629634e-07, |
|
"loss": 1.9178, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.06518518518518518, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 3.259259259259259e-07, |
|
"loss": 1.8998, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.07111111111111111, |
|
"grad_norm": 0.75, |
|
"learning_rate": 3.555555555555556e-07, |
|
"loss": 2.0155, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.07703703703703704, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 3.8518518518518525e-07, |
|
"loss": 2.0011, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.08296296296296296, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 4.148148148148148e-07, |
|
"loss": 1.9424, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.08888888888888889, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 4.444444444444445e-07, |
|
"loss": 1.8711, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.09481481481481481, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 4.740740740740741e-07, |
|
"loss": 1.902, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.10074074074074074, |
|
"grad_norm": 0.625, |
|
"learning_rate": 5.037037037037038e-07, |
|
"loss": 1.942, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.10666666666666667, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 5.333333333333335e-07, |
|
"loss": 1.8852, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.11259259259259259, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 5.62962962962963e-07, |
|
"loss": 1.9527, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.11851851851851852, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 5.925925925925927e-07, |
|
"loss": 1.8927, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.12444444444444444, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 6.222222222222223e-07, |
|
"loss": 1.9382, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.13037037037037036, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 6.518518518518518e-07, |
|
"loss": 1.906, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.1362962962962963, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 6.814814814814816e-07, |
|
"loss": 1.851, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.14222222222222222, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 7.111111111111112e-07, |
|
"loss": 1.901, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.14814814814814814, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 7.407407407407407e-07, |
|
"loss": 1.851, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.15407407407407409, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 7.703703703703705e-07, |
|
"loss": 1.809, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 8.000000000000001e-07, |
|
"loss": 1.8846, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.16592592592592592, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 8.296296296296296e-07, |
|
"loss": 1.8922, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.17185185185185184, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 8.592592592592593e-07, |
|
"loss": 1.9092, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.17777777777777778, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 8.88888888888889e-07, |
|
"loss": 1.868, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.1837037037037037, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 9.185185185185185e-07, |
|
"loss": 1.8768, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.18962962962962962, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 9.481481481481482e-07, |
|
"loss": 1.8321, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.19555555555555557, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 9.77777777777778e-07, |
|
"loss": 1.8788, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.20148148148148148, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 1.0074074074074076e-06, |
|
"loss": 1.8518, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.2074074074074074, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 1.0370370370370371e-06, |
|
"loss": 1.8652, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.21333333333333335, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 1.066666666666667e-06, |
|
"loss": 1.7996, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.21925925925925926, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 1.0962962962962965e-06, |
|
"loss": 1.8566, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.22518518518518518, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 1.125925925925926e-06, |
|
"loss": 1.8537, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.2311111111111111, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 1.1555555555555556e-06, |
|
"loss": 1.8452, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.23703703703703705, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 1.1851851851851854e-06, |
|
"loss": 1.8393, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.24296296296296296, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 1.214814814814815e-06, |
|
"loss": 1.7721, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.24888888888888888, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 1.2444444444444445e-06, |
|
"loss": 1.8168, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.2548148148148148, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 1.2740740740740743e-06, |
|
"loss": 1.8316, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.2607407407407407, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 1.3037037037037036e-06, |
|
"loss": 1.7708, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 1.7678, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2725925925925926, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 1.3629629629629632e-06, |
|
"loss": 1.85, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.2785185185185185, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 1.3925925925925925e-06, |
|
"loss": 1.8612, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.28444444444444444, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 1.4222222222222223e-06, |
|
"loss": 1.8431, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.2903703703703704, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 1.451851851851852e-06, |
|
"loss": 1.8036, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.2962962962962963, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 1.4814814814814815e-06, |
|
"loss": 1.7861, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3022222222222222, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 1.5111111111111112e-06, |
|
"loss": 1.8524, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.30814814814814817, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 1.540740740740741e-06, |
|
"loss": 1.7964, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.31407407407407406, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 1.5703703703703704e-06, |
|
"loss": 1.894, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 1.6000000000000001e-06, |
|
"loss": 1.7428, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.32592592592592595, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 1.62962962962963e-06, |
|
"loss": 1.7648, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.33185185185185184, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 1.6592592592592593e-06, |
|
"loss": 1.7671, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.3377777777777778, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 1.688888888888889e-06, |
|
"loss": 1.8195, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.3437037037037037, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 1.7185185185185186e-06, |
|
"loss": 1.7553, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.3496296296296296, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 1.7481481481481482e-06, |
|
"loss": 1.8649, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.35555555555555557, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 1.777777777777778e-06, |
|
"loss": 1.6996, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.36148148148148146, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 1.8074074074074075e-06, |
|
"loss": 1.8027, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.3674074074074074, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 1.837037037037037e-06, |
|
"loss": 1.7646, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.37333333333333335, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 1.8666666666666669e-06, |
|
"loss": 1.6897, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.37925925925925924, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 1.8962962962962964e-06, |
|
"loss": 1.7192, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.3851851851851852, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 1.925925925925926e-06, |
|
"loss": 1.7491, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.39111111111111113, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 1.955555555555556e-06, |
|
"loss": 1.6689, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.397037037037037, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 1.985185185185185e-06, |
|
"loss": 1.6721, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.40296296296296297, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 2.014814814814815e-06, |
|
"loss": 1.7495, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.4088888888888889, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 2.0444444444444447e-06, |
|
"loss": 1.7475, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.4148148148148148, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 2.0740740740740742e-06, |
|
"loss": 1.7103, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.42074074074074075, |
|
"grad_norm": 0.5, |
|
"learning_rate": 2.103703703703704e-06, |
|
"loss": 1.6456, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.4266666666666667, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 2.133333333333334e-06, |
|
"loss": 1.7045, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.4325925925925926, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 2.162962962962963e-06, |
|
"loss": 1.6505, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.43851851851851853, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 2.192592592592593e-06, |
|
"loss": 1.5497, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 2.222222222222222e-06, |
|
"loss": 1.5977, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.45037037037037037, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 2.251851851851852e-06, |
|
"loss": 1.5589, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.4562962962962963, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 2.2814814814814816e-06, |
|
"loss": 1.5472, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.4622222222222222, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 2.311111111111111e-06, |
|
"loss": 1.6132, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.46814814814814815, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 2.3407407407407408e-06, |
|
"loss": 1.586, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.4740740740740741, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 2.3703703703703707e-06, |
|
"loss": 1.5127, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 1.5141, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.48592592592592593, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 2.42962962962963e-06, |
|
"loss": 1.5157, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.4918518518518519, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 2.4592592592592594e-06, |
|
"loss": 1.5395, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.49777777777777776, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 2.488888888888889e-06, |
|
"loss": 1.4586, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.5037037037037037, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 2.5185185185185186e-06, |
|
"loss": 1.4836, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.5096296296296297, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 2.5481481481481486e-06, |
|
"loss": 1.5109, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.5155555555555555, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 2.577777777777778e-06, |
|
"loss": 1.5414, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.5214814814814814, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 2.6074074074074073e-06, |
|
"loss": 1.5813, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.5274074074074074, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 2.6370370370370373e-06, |
|
"loss": 1.5222, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 1.5081, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5392592592592592, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 2.6962962962962964e-06, |
|
"loss": 1.5026, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.5451851851851852, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 2.7259259259259264e-06, |
|
"loss": 1.5438, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.5511111111111111, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 2.755555555555556e-06, |
|
"loss": 1.476, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.557037037037037, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 2.785185185185185e-06, |
|
"loss": 1.4509, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.562962962962963, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 2.814814814814815e-06, |
|
"loss": 1.5576, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.5688888888888889, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 2.8444444444444446e-06, |
|
"loss": 1.5297, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.5748148148148148, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 2.874074074074074e-06, |
|
"loss": 1.4491, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.5807407407407408, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 2.903703703703704e-06, |
|
"loss": 1.59, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.5866666666666667, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 2.9333333333333338e-06, |
|
"loss": 1.4926, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.5925925925925926, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 2.962962962962963e-06, |
|
"loss": 1.5031, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5985185185185186, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 2.992592592592593e-06, |
|
"loss": 1.4425, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.6044444444444445, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 3.0222222222222225e-06, |
|
"loss": 1.5279, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.6103703703703703, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 3.051851851851852e-06, |
|
"loss": 1.5437, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.6162962962962963, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 3.081481481481482e-06, |
|
"loss": 1.4972, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.6222222222222222, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 3.1111111111111116e-06, |
|
"loss": 1.4935, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.6281481481481481, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 3.1407407407407407e-06, |
|
"loss": 1.595, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.6340740740740741, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 3.1703703703703707e-06, |
|
"loss": 1.4306, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 1.5468, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.6459259259259259, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 3.22962962962963e-06, |
|
"loss": 1.477, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.6518518518518519, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 3.25925925925926e-06, |
|
"loss": 1.4971, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.6577777777777778, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 3.2888888888888894e-06, |
|
"loss": 1.4715, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.6637037037037037, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 3.3185185185185185e-06, |
|
"loss": 1.4916, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.6696296296296296, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 3.348148148148148e-06, |
|
"loss": 1.5192, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.6755555555555556, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 3.377777777777778e-06, |
|
"loss": 1.4304, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.6814814814814815, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 3.4074074074074077e-06, |
|
"loss": 1.5796, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.6874074074074074, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 3.4370370370370372e-06, |
|
"loss": 1.5835, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.6933333333333334, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 3.4666666666666672e-06, |
|
"loss": 1.4515, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.6992592592592592, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 3.4962962962962964e-06, |
|
"loss": 1.4513, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.7051851851851851, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 3.525925925925926e-06, |
|
"loss": 1.5401, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.7111111111111111, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 3.555555555555556e-06, |
|
"loss": 1.5242, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.717037037037037, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 3.5851851851851855e-06, |
|
"loss": 1.4171, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.7229629629629629, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 3.614814814814815e-06, |
|
"loss": 1.4914, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.7288888888888889, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 3.644444444444445e-06, |
|
"loss": 1.4023, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.7348148148148148, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 3.674074074074074e-06, |
|
"loss": 1.5467, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.7407407407407407, |
|
"grad_norm": 0.25, |
|
"learning_rate": 3.7037037037037037e-06, |
|
"loss": 1.5131, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.7466666666666667, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 3.7333333333333337e-06, |
|
"loss": 1.4868, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.7525925925925926, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 3.7629629629629633e-06, |
|
"loss": 1.5638, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.7585185185185185, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 3.792592592592593e-06, |
|
"loss": 1.5138, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.7644444444444445, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 3.8222222222222224e-06, |
|
"loss": 1.4604, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.7703703703703704, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 3.851851851851852e-06, |
|
"loss": 1.4806, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.7762962962962963, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 3.8814814814814816e-06, |
|
"loss": 1.5501, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.7822222222222223, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 3.911111111111112e-06, |
|
"loss": 1.5254, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.7881481481481482, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 3.940740740740741e-06, |
|
"loss": 1.5049, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.794074074074074, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 3.97037037037037e-06, |
|
"loss": 1.3749, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.5036, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.8059259259259259, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 4.02962962962963e-06, |
|
"loss": 1.4699, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.8118518518518518, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 4.05925925925926e-06, |
|
"loss": 1.4729, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.8177777777777778, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 4.088888888888889e-06, |
|
"loss": 1.4782, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.8237037037037037, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 4.118518518518519e-06, |
|
"loss": 1.4379, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.8296296296296296, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 4.1481481481481485e-06, |
|
"loss": 1.515, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.8355555555555556, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 4.177777777777778e-06, |
|
"loss": 1.466, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.8414814814814815, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 4.207407407407408e-06, |
|
"loss": 1.4852, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.8474074074074074, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 4.237037037037037e-06, |
|
"loss": 1.5604, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.8533333333333334, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 4.266666666666668e-06, |
|
"loss": 1.4395, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.8592592592592593, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 4.296296296296296e-06, |
|
"loss": 1.455, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.8651851851851852, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 4.325925925925926e-06, |
|
"loss": 1.5373, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.8711111111111111, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 4.3555555555555555e-06, |
|
"loss": 1.3663, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.8770370370370371, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 4.385185185185186e-06, |
|
"loss": 1.3726, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.882962962962963, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 4.4148148148148154e-06, |
|
"loss": 1.4718, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 4.444444444444444e-06, |
|
"loss": 1.4882, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.8948148148148148, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 4.4740740740740746e-06, |
|
"loss": 1.535, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.9007407407407407, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 4.503703703703704e-06, |
|
"loss": 1.4269, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.9066666666666666, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 4.533333333333334e-06, |
|
"loss": 1.434, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.9125925925925926, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 4.562962962962963e-06, |
|
"loss": 1.4714, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.9185185185185185, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 4.592592592592593e-06, |
|
"loss": 1.4427, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.9244444444444444, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 4.622222222222222e-06, |
|
"loss": 1.4902, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.9303703703703704, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 4.651851851851853e-06, |
|
"loss": 1.5026, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.9362962962962963, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 4.6814814814814815e-06, |
|
"loss": 1.4131, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.9422222222222222, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 4.711111111111111e-06, |
|
"loss": 1.4583, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.9481481481481482, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 4.7407407407407415e-06, |
|
"loss": 1.5298, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.9540740740740741, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 4.770370370370371e-06, |
|
"loss": 1.4845, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 1.5329, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.965925925925926, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 4.82962962962963e-06, |
|
"loss": 1.513, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.9718518518518519, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 4.85925925925926e-06, |
|
"loss": 1.4065, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.9777777777777777, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 4.888888888888889e-06, |
|
"loss": 1.4303, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.9837037037037037, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 4.918518518518519e-06, |
|
"loss": 1.4025, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.9896296296296296, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 4.9481481481481485e-06, |
|
"loss": 1.5732, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.9955555555555555, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 4.977777777777778e-06, |
|
"loss": 1.5408, |
|
"step": 3360 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 16875, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"total_flos": 4.4539748890148045e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|