|
{ |
|
"best_metric": 0.420650452375412, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-50", |
|
"epoch": 3.016949152542373, |
|
"eval_steps": 50, |
|
"global_step": 89, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03389830508474576, |
|
"grad_norm": 0.6644117832183838, |
|
"learning_rate": 1.0500000000000001e-05, |
|
"loss": 1.2728, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.03389830508474576, |
|
"eval_loss": 1.325829029083252, |
|
"eval_runtime": 0.8828, |
|
"eval_samples_per_second": 450.827, |
|
"eval_steps_per_second": 14.725, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.06779661016949153, |
|
"grad_norm": 0.7109208106994629, |
|
"learning_rate": 2.1000000000000002e-05, |
|
"loss": 1.3144, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.1016949152542373, |
|
"grad_norm": 0.715027928352356, |
|
"learning_rate": 3.15e-05, |
|
"loss": 1.3375, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.13559322033898305, |
|
"grad_norm": 0.7128281593322754, |
|
"learning_rate": 4.2000000000000004e-05, |
|
"loss": 1.326, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.1694915254237288, |
|
"grad_norm": 0.719323992729187, |
|
"learning_rate": 5.25e-05, |
|
"loss": 1.3235, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.2033898305084746, |
|
"grad_norm": 0.7542026042938232, |
|
"learning_rate": 6.3e-05, |
|
"loss": 1.348, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.23728813559322035, |
|
"grad_norm": 0.6400342583656311, |
|
"learning_rate": 7.35e-05, |
|
"loss": 1.2302, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.2711864406779661, |
|
"grad_norm": 0.6019027829170227, |
|
"learning_rate": 8.400000000000001e-05, |
|
"loss": 1.1686, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.3050847457627119, |
|
"grad_norm": 0.5761123299598694, |
|
"learning_rate": 9.45e-05, |
|
"loss": 1.0889, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.3389830508474576, |
|
"grad_norm": 0.5195322036743164, |
|
"learning_rate": 0.000105, |
|
"loss": 1.0254, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.3728813559322034, |
|
"grad_norm": 0.5309574604034424, |
|
"learning_rate": 0.00010495849335443335, |
|
"loss": 0.9455, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.4067796610169492, |
|
"grad_norm": 0.5846827030181885, |
|
"learning_rate": 0.00010483403904827154, |
|
"loss": 0.886, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.4406779661016949, |
|
"grad_norm": 0.5385061502456665, |
|
"learning_rate": 0.0001046268338693536, |
|
"loss": 0.8475, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.4745762711864407, |
|
"grad_norm": 0.34226641058921814, |
|
"learning_rate": 0.0001043372054516575, |
|
"loss": 0.7105, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.5084745762711864, |
|
"grad_norm": 0.5354804992675781, |
|
"learning_rate": 0.0001039656117572434, |
|
"loss": 0.681, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.5423728813559322, |
|
"grad_norm": 0.7758191227912903, |
|
"learning_rate": 0.00010351264035212153, |
|
"loss": 0.6268, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.576271186440678, |
|
"grad_norm": 0.4772910475730896, |
|
"learning_rate": 0.00010297900747718958, |
|
"loss": 0.6068, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.6101694915254238, |
|
"grad_norm": 0.39711692929267883, |
|
"learning_rate": 0.0001023655569157086, |
|
"loss": 0.5645, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.6440677966101694, |
|
"grad_norm": 0.6140002608299255, |
|
"learning_rate": 0.00010167325865910821, |
|
"loss": 0.5802, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.6779661016949152, |
|
"grad_norm": 0.24760758876800537, |
|
"learning_rate": 0.00010090320737323084, |
|
"loss": 0.5746, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.711864406779661, |
|
"grad_norm": 0.20960871875286102, |
|
"learning_rate": 0.00010005662066743998, |
|
"loss": 0.5206, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.7457627118644068, |
|
"grad_norm": 0.20185863971710205, |
|
"learning_rate": 9.913483716932943e-05, |
|
"loss": 0.489, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.7796610169491526, |
|
"grad_norm": 0.1934608370065689, |
|
"learning_rate": 9.81393144080781e-05, |
|
"loss": 0.4544, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.8135593220338984, |
|
"grad_norm": 0.2110586315393448, |
|
"learning_rate": 9.707162650979662e-05, |
|
"loss": 0.4844, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.847457627118644, |
|
"grad_norm": 0.4501343071460724, |
|
"learning_rate": 9.593346170851051e-05, |
|
"loss": 0.4708, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.8813559322033898, |
|
"grad_norm": 0.8199589848518372, |
|
"learning_rate": 9.472661967671516e-05, |
|
"loss": 0.5096, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.9152542372881356, |
|
"grad_norm": 0.42125675082206726, |
|
"learning_rate": 9.345300867972365e-05, |
|
"loss": 0.4668, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.9491525423728814, |
|
"grad_norm": 0.1957239806652069, |
|
"learning_rate": 9.211464255830708e-05, |
|
"loss": 0.4379, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.9830508474576272, |
|
"grad_norm": 0.46031802892684937, |
|
"learning_rate": 9.071363754439846e-05, |
|
"loss": 0.4519, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 1.0169491525423728, |
|
"grad_norm": 0.7945327758789062, |
|
"learning_rate": 8.925220891489483e-05, |
|
"loss": 0.7032, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.0508474576271187, |
|
"grad_norm": 0.13100884854793549, |
|
"learning_rate": 8.773266748884944e-05, |
|
"loss": 0.4579, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 1.0847457627118644, |
|
"grad_norm": 0.3228919804096222, |
|
"learning_rate": 8.615741597359156e-05, |
|
"loss": 0.4398, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.11864406779661, |
|
"grad_norm": 0.35424545407295227, |
|
"learning_rate": 8.452894516555253e-05, |
|
"loss": 0.4465, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.152542372881356, |
|
"grad_norm": 0.22786343097686768, |
|
"learning_rate": 8.284983001180455e-05, |
|
"loss": 0.4068, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.1864406779661016, |
|
"grad_norm": 0.16856859624385834, |
|
"learning_rate": 8.112272553854005e-05, |
|
"loss": 0.4109, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.2203389830508475, |
|
"grad_norm": 0.29634520411491394, |
|
"learning_rate": 7.935036265292968e-05, |
|
"loss": 0.4125, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.2542372881355932, |
|
"grad_norm": 0.2798968553543091, |
|
"learning_rate": 7.753554382499657e-05, |
|
"loss": 0.4788, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.288135593220339, |
|
"grad_norm": 0.11876481026411057, |
|
"learning_rate": 7.568113865633538e-05, |
|
"loss": 0.4387, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.3220338983050848, |
|
"grad_norm": 0.08868218213319778, |
|
"learning_rate": 7.379007934268217e-05, |
|
"loss": 0.4091, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.3559322033898304, |
|
"grad_norm": 0.22629207372665405, |
|
"learning_rate": 7.18653560375104e-05, |
|
"loss": 0.4052, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.3898305084745763, |
|
"grad_norm": 0.3182011842727661, |
|
"learning_rate": 6.991001212398357e-05, |
|
"loss": 0.4155, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.423728813559322, |
|
"grad_norm": 0.48684900999069214, |
|
"learning_rate": 6.792713940274086e-05, |
|
"loss": 0.3487, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.457627118644068, |
|
"grad_norm": 0.7674040198326111, |
|
"learning_rate": 6.591987320312492e-05, |
|
"loss": 0.5046, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.4915254237288136, |
|
"grad_norm": 0.5276405811309814, |
|
"learning_rate": 6.38913874255817e-05, |
|
"loss": 0.4545, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.5254237288135593, |
|
"grad_norm": 0.12091302126646042, |
|
"learning_rate": 6.184488952307141e-05, |
|
"loss": 0.4387, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.559322033898305, |
|
"grad_norm": 0.17123517394065857, |
|
"learning_rate": 5.9783615429426096e-05, |
|
"loss": 0.4108, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.5932203389830508, |
|
"grad_norm": 0.4185921251773834, |
|
"learning_rate": 5.7710824442673244e-05, |
|
"loss": 0.407, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.6271186440677967, |
|
"grad_norm": 0.5745218992233276, |
|
"learning_rate": 5.562979407141554e-05, |
|
"loss": 0.4002, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.6610169491525424, |
|
"grad_norm": 0.09879444539546967, |
|
"learning_rate": 5.3543814852416186e-05, |
|
"loss": 0.4583, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.694915254237288, |
|
"grad_norm": 0.29606881737709045, |
|
"learning_rate": 5.145618514758382e-05, |
|
"loss": 0.4399, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.694915254237288, |
|
"eval_loss": 0.420650452375412, |
|
"eval_runtime": 1.3705, |
|
"eval_samples_per_second": 290.407, |
|
"eval_steps_per_second": 9.486, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.7288135593220337, |
|
"grad_norm": 0.2548205256462097, |
|
"learning_rate": 4.9370205928584464e-05, |
|
"loss": 0.4107, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.7627118644067796, |
|
"grad_norm": 0.24160364270210266, |
|
"learning_rate": 4.728917555732678e-05, |
|
"loss": 0.4312, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.7966101694915255, |
|
"grad_norm": 0.06337052583694458, |
|
"learning_rate": 4.521638457057392e-05, |
|
"loss": 0.4052, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.8305084745762712, |
|
"grad_norm": 0.26108115911483765, |
|
"learning_rate": 4.315511047692862e-05, |
|
"loss": 0.3744, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.8644067796610169, |
|
"grad_norm": 0.18919679522514343, |
|
"learning_rate": 4.11086125744183e-05, |
|
"loss": 0.4286, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.8983050847457628, |
|
"grad_norm": 0.24806950986385345, |
|
"learning_rate": 3.908012679687508e-05, |
|
"loss": 0.4354, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.9322033898305084, |
|
"grad_norm": 0.16577477753162384, |
|
"learning_rate": 3.707286059725916e-05, |
|
"loss": 0.4216, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.9661016949152543, |
|
"grad_norm": 0.07274139672517776, |
|
"learning_rate": 3.5089987876016445e-05, |
|
"loss": 0.3875, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.7141322493553162, |
|
"learning_rate": 3.313464396248959e-05, |
|
"loss": 0.5832, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 2.0338983050847457, |
|
"grad_norm": 0.3955691158771515, |
|
"learning_rate": 3.1209920657317824e-05, |
|
"loss": 0.4751, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.0677966101694913, |
|
"grad_norm": 0.37002313137054443, |
|
"learning_rate": 2.9318861343664646e-05, |
|
"loss": 0.4308, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 2.1016949152542375, |
|
"grad_norm": 0.15991376340389252, |
|
"learning_rate": 2.746445617500344e-05, |
|
"loss": 0.4005, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 2.135593220338983, |
|
"grad_norm": 0.08792717754840851, |
|
"learning_rate": 2.5649637347070338e-05, |
|
"loss": 0.3849, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 2.169491525423729, |
|
"grad_norm": 0.14280790090560913, |
|
"learning_rate": 2.3877274461459966e-05, |
|
"loss": 0.3955, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.2033898305084745, |
|
"grad_norm": 0.40407660603523254, |
|
"learning_rate": 2.2150169988195452e-05, |
|
"loss": 0.3807, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.23728813559322, |
|
"grad_norm": 0.05895840376615524, |
|
"learning_rate": 2.0471054834447473e-05, |
|
"loss": 0.4356, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.2711864406779663, |
|
"grad_norm": 0.20810866355895996, |
|
"learning_rate": 1.884258402640846e-05, |
|
"loss": 0.4502, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.305084745762712, |
|
"grad_norm": 0.10166915506124496, |
|
"learning_rate": 1.7267332511150564e-05, |
|
"loss": 0.4194, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.3389830508474576, |
|
"grad_norm": 0.059947896748781204, |
|
"learning_rate": 1.5747791085105175e-05, |
|
"loss": 0.4241, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.3728813559322033, |
|
"grad_norm": 0.08675111830234528, |
|
"learning_rate": 1.428636245560156e-05, |
|
"loss": 0.4057, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.406779661016949, |
|
"grad_norm": 0.2886503040790558, |
|
"learning_rate": 1.2885357441692914e-05, |
|
"loss": 0.3794, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.440677966101695, |
|
"grad_norm": 0.11011155694723129, |
|
"learning_rate": 1.154699132027637e-05, |
|
"loss": 0.4224, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.4745762711864407, |
|
"grad_norm": 0.23668210208415985, |
|
"learning_rate": 1.027338032328485e-05, |
|
"loss": 0.4624, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.5084745762711864, |
|
"grad_norm": 0.1450003683567047, |
|
"learning_rate": 9.066538291489505e-06, |
|
"loss": 0.4063, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.542372881355932, |
|
"grad_norm": 0.0950573980808258, |
|
"learning_rate": 7.928373490203387e-06, |
|
"loss": 0.3955, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.576271186440678, |
|
"grad_norm": 0.1254480630159378, |
|
"learning_rate": 6.86068559192192e-06, |
|
"loss": 0.4191, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.610169491525424, |
|
"grad_norm": 0.2626599371433258, |
|
"learning_rate": 5.865162830670588e-06, |
|
"loss": 0.3544, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 2.6440677966101696, |
|
"grad_norm": 0.38685888051986694, |
|
"learning_rate": 4.9433793325600285e-06, |
|
"loss": 0.4176, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.6779661016949152, |
|
"grad_norm": 0.3029015362262726, |
|
"learning_rate": 4.096792626769155e-06, |
|
"loss": 0.42, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.711864406779661, |
|
"grad_norm": 0.17327213287353516, |
|
"learning_rate": 3.326741340891792e-06, |
|
"loss": 0.4334, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.7457627118644066, |
|
"grad_norm": 0.10987525433301926, |
|
"learning_rate": 2.6344430842914096e-06, |
|
"loss": 0.4149, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.7796610169491527, |
|
"grad_norm": 0.12840408086776733, |
|
"learning_rate": 2.020992522810428e-06, |
|
"loss": 0.3796, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.8135593220338984, |
|
"grad_norm": 0.19848020374774933, |
|
"learning_rate": 1.487359647878472e-06, |
|
"loss": 0.3997, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.847457627118644, |
|
"grad_norm": 0.4496248960494995, |
|
"learning_rate": 1.0343882427566057e-06, |
|
"loss": 0.3558, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.8813559322033897, |
|
"grad_norm": 0.34108394384384155, |
|
"learning_rate": 6.627945483424986e-07, |
|
"loss": 0.4417, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.915254237288136, |
|
"grad_norm": 0.1465616226196289, |
|
"learning_rate": 3.73166130646396e-07, |
|
"loss": 0.4156, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.9491525423728815, |
|
"grad_norm": 0.08617256581783295, |
|
"learning_rate": 1.6596095172846882e-07, |
|
"loss": 0.4077, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 2.983050847457627, |
|
"grad_norm": 0.2514786422252655, |
|
"learning_rate": 4.1506645566653235e-08, |
|
"loss": 0.4062, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 3.016949152542373, |
|
"grad_norm": 0.42695701122283936, |
|
"learning_rate": 0.0, |
|
"loss": 0.6505, |
|
"step": 89 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 89, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.890642232639488e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|