|
{ |
|
"best_metric": 2.0890417098999023, |
|
"best_model_checkpoint": "/data/sunggeunan/ICL/src/outputs/Meta-Llama-3-8B-Instruct_qa_ft_QA_mrqa_nq_SQuAD_3shot_1docs/checkpoint-481", |
|
"epoch": 4.997402597402598, |
|
"eval_steps": 100, |
|
"global_step": 481, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01038961038961039, |
|
"grad_norm": 0.25633782148361206, |
|
"learning_rate": 6.666666666666667e-09, |
|
"loss": 2.1094, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.02077922077922078, |
|
"grad_norm": 0.26136407256126404, |
|
"learning_rate": 1.3333333333333334e-08, |
|
"loss": 2.1062, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.03116883116883117, |
|
"grad_norm": 0.25959405303001404, |
|
"learning_rate": 2e-08, |
|
"loss": 2.1249, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.04155844155844156, |
|
"grad_norm": 0.2641236484050751, |
|
"learning_rate": 2.6666666666666667e-08, |
|
"loss": 2.1248, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.05194805194805195, |
|
"grad_norm": 0.2523995637893677, |
|
"learning_rate": 3.3333333333333334e-08, |
|
"loss": 2.1085, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.06233766233766234, |
|
"grad_norm": 0.25376567244529724, |
|
"learning_rate": 4e-08, |
|
"loss": 2.0999, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.07272727272727272, |
|
"grad_norm": 0.2529865801334381, |
|
"learning_rate": 4.666666666666667e-08, |
|
"loss": 2.1005, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.08311688311688312, |
|
"grad_norm": 0.26591023802757263, |
|
"learning_rate": 5.3333333333333334e-08, |
|
"loss": 2.0975, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.09350649350649351, |
|
"grad_norm": 0.2609612047672272, |
|
"learning_rate": 6e-08, |
|
"loss": 2.1294, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.1038961038961039, |
|
"grad_norm": 0.260165274143219, |
|
"learning_rate": 6.666666666666667e-08, |
|
"loss": 2.0961, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.11428571428571428, |
|
"grad_norm": 0.2504313886165619, |
|
"learning_rate": 7.333333333333333e-08, |
|
"loss": 2.0863, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.12467532467532468, |
|
"grad_norm": 0.2633957266807556, |
|
"learning_rate": 8e-08, |
|
"loss": 2.0822, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.13506493506493505, |
|
"grad_norm": 0.2547496557235718, |
|
"learning_rate": 8.666666666666666e-08, |
|
"loss": 2.1066, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.14545454545454545, |
|
"grad_norm": 0.27546900510787964, |
|
"learning_rate": 9.333333333333334e-08, |
|
"loss": 2.1399, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.15584415584415584, |
|
"grad_norm": 0.25894761085510254, |
|
"learning_rate": 1e-07, |
|
"loss": 2.1071, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.16623376623376623, |
|
"grad_norm": 0.2625376284122467, |
|
"learning_rate": 1.0666666666666667e-07, |
|
"loss": 2.1171, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.17662337662337663, |
|
"grad_norm": 0.25433292984962463, |
|
"learning_rate": 1.1333333333333332e-07, |
|
"loss": 2.0932, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.18701298701298702, |
|
"grad_norm": 0.26560184359550476, |
|
"learning_rate": 1.2e-07, |
|
"loss": 2.0945, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.1974025974025974, |
|
"grad_norm": 0.26605260372161865, |
|
"learning_rate": 1.2666666666666666e-07, |
|
"loss": 2.1034, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.2077922077922078, |
|
"grad_norm": 0.2608611583709717, |
|
"learning_rate": 1.3333333333333334e-07, |
|
"loss": 2.0955, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.21818181818181817, |
|
"grad_norm": 0.2744869589805603, |
|
"learning_rate": 1.4e-07, |
|
"loss": 2.0934, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.22857142857142856, |
|
"grad_norm": 0.2602550685405731, |
|
"learning_rate": 1.4666666666666666e-07, |
|
"loss": 2.0788, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.23896103896103896, |
|
"grad_norm": 0.2581612467765808, |
|
"learning_rate": 1.533333333333333e-07, |
|
"loss": 2.1258, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.24935064935064935, |
|
"grad_norm": 0.25001809000968933, |
|
"learning_rate": 1.6e-07, |
|
"loss": 2.1034, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.2597402597402597, |
|
"grad_norm": 0.2558351457118988, |
|
"learning_rate": 1.6666666666666665e-07, |
|
"loss": 2.1111, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.2701298701298701, |
|
"grad_norm": 0.26691997051239014, |
|
"learning_rate": 1.7333333333333332e-07, |
|
"loss": 2.1056, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.2805194805194805, |
|
"grad_norm": 0.25776407122612, |
|
"learning_rate": 1.8e-07, |
|
"loss": 2.0942, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.2909090909090909, |
|
"grad_norm": 0.2654891610145569, |
|
"learning_rate": 1.8666666666666667e-07, |
|
"loss": 2.0946, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.3012987012987013, |
|
"grad_norm": 0.2603527009487152, |
|
"learning_rate": 1.9333333333333332e-07, |
|
"loss": 2.1056, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.3116883116883117, |
|
"grad_norm": 0.2545248568058014, |
|
"learning_rate": 2e-07, |
|
"loss": 2.1149, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.3220779220779221, |
|
"grad_norm": 0.26618441939353943, |
|
"learning_rate": 2.0666666666666666e-07, |
|
"loss": 2.1127, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.33246753246753247, |
|
"grad_norm": 0.26514533162117004, |
|
"learning_rate": 2.1333333333333334e-07, |
|
"loss": 2.096, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.34285714285714286, |
|
"grad_norm": 0.2551611065864563, |
|
"learning_rate": 2.1999999999999998e-07, |
|
"loss": 2.1051, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.35324675324675325, |
|
"grad_norm": 0.26668792963027954, |
|
"learning_rate": 2.2666666666666663e-07, |
|
"loss": 2.1081, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.36363636363636365, |
|
"grad_norm": 0.24424628913402557, |
|
"learning_rate": 2.3333333333333333e-07, |
|
"loss": 2.0856, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.37402597402597404, |
|
"grad_norm": 0.26595574617385864, |
|
"learning_rate": 2.4e-07, |
|
"loss": 2.1143, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.38441558441558443, |
|
"grad_norm": 0.26116102933883667, |
|
"learning_rate": 2.4666666666666665e-07, |
|
"loss": 2.0815, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.3948051948051948, |
|
"grad_norm": 0.2586928904056549, |
|
"learning_rate": 2.533333333333333e-07, |
|
"loss": 2.1031, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.4051948051948052, |
|
"grad_norm": 0.2762044668197632, |
|
"learning_rate": 2.6e-07, |
|
"loss": 2.116, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.4155844155844156, |
|
"grad_norm": 0.2627628445625305, |
|
"learning_rate": 2.6666666666666667e-07, |
|
"loss": 2.1134, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.42597402597402595, |
|
"grad_norm": 0.26995256543159485, |
|
"learning_rate": 2.733333333333333e-07, |
|
"loss": 2.0891, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.43636363636363634, |
|
"grad_norm": 0.25931429862976074, |
|
"learning_rate": 2.8e-07, |
|
"loss": 2.1116, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.44675324675324674, |
|
"grad_norm": 0.25965389609336853, |
|
"learning_rate": 2.866666666666667e-07, |
|
"loss": 2.0947, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.45714285714285713, |
|
"grad_norm": 0.2590876519680023, |
|
"learning_rate": 2.933333333333333e-07, |
|
"loss": 2.1066, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.4675324675324675, |
|
"grad_norm": 0.2535057067871094, |
|
"learning_rate": 3e-07, |
|
"loss": 2.0889, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.4779220779220779, |
|
"grad_norm": 0.26631295680999756, |
|
"learning_rate": 3.066666666666666e-07, |
|
"loss": 2.0938, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.4883116883116883, |
|
"grad_norm": 0.2609468102455139, |
|
"learning_rate": 3.1333333333333333e-07, |
|
"loss": 2.1034, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.4987012987012987, |
|
"grad_norm": 0.2554691731929779, |
|
"learning_rate": 3.2e-07, |
|
"loss": 2.1341, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.509090909090909, |
|
"grad_norm": 0.27121829986572266, |
|
"learning_rate": 3.2666666666666663e-07, |
|
"loss": 2.1238, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.5194805194805194, |
|
"grad_norm": 0.28885209560394287, |
|
"learning_rate": 3.333333333333333e-07, |
|
"loss": 2.0966, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5298701298701298, |
|
"grad_norm": 0.28006577491760254, |
|
"learning_rate": 3.4000000000000003e-07, |
|
"loss": 2.1542, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.5402597402597402, |
|
"grad_norm": 0.26597273349761963, |
|
"learning_rate": 3.4666666666666665e-07, |
|
"loss": 2.1042, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.5506493506493506, |
|
"grad_norm": 0.2693743109703064, |
|
"learning_rate": 3.533333333333333e-07, |
|
"loss": 2.1125, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.561038961038961, |
|
"grad_norm": 0.25912410020828247, |
|
"learning_rate": 3.6e-07, |
|
"loss": 2.0925, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 0.27581265568733215, |
|
"learning_rate": 3.666666666666666e-07, |
|
"loss": 2.0952, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.5818181818181818, |
|
"grad_norm": 0.271810382604599, |
|
"learning_rate": 3.7333333333333334e-07, |
|
"loss": 2.1108, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.5922077922077922, |
|
"grad_norm": 0.26822298765182495, |
|
"learning_rate": 3.7999999999999996e-07, |
|
"loss": 2.1143, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.6025974025974026, |
|
"grad_norm": 0.27131083607673645, |
|
"learning_rate": 3.8666666666666664e-07, |
|
"loss": 2.1061, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.612987012987013, |
|
"grad_norm": 0.2661900520324707, |
|
"learning_rate": 3.933333333333333e-07, |
|
"loss": 2.1068, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.6233766233766234, |
|
"grad_norm": 0.2700493335723877, |
|
"learning_rate": 4e-07, |
|
"loss": 2.1428, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6337662337662338, |
|
"grad_norm": 0.2725152373313904, |
|
"learning_rate": 4.0666666666666666e-07, |
|
"loss": 2.1086, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.6441558441558441, |
|
"grad_norm": 0.27668678760528564, |
|
"learning_rate": 4.1333333333333333e-07, |
|
"loss": 2.1074, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.6545454545454545, |
|
"grad_norm": 0.2740931212902069, |
|
"learning_rate": 4.1999999999999995e-07, |
|
"loss": 2.0993, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.6649350649350649, |
|
"grad_norm": 0.26940861344337463, |
|
"learning_rate": 4.266666666666667e-07, |
|
"loss": 2.0949, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.6753246753246753, |
|
"grad_norm": 0.2628956139087677, |
|
"learning_rate": 4.3333333333333335e-07, |
|
"loss": 2.1142, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.6857142857142857, |
|
"grad_norm": 0.25989633798599243, |
|
"learning_rate": 4.3999999999999997e-07, |
|
"loss": 2.0982, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.6961038961038961, |
|
"grad_norm": 0.26778894662857056, |
|
"learning_rate": 4.4666666666666664e-07, |
|
"loss": 2.127, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.7064935064935065, |
|
"grad_norm": 0.27109193801879883, |
|
"learning_rate": 4.5333333333333326e-07, |
|
"loss": 2.1218, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.7168831168831169, |
|
"grad_norm": 0.2669210433959961, |
|
"learning_rate": 4.6e-07, |
|
"loss": 2.1008, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.7272727272727273, |
|
"grad_norm": 0.2724508047103882, |
|
"learning_rate": 4.6666666666666666e-07, |
|
"loss": 2.1101, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.7376623376623377, |
|
"grad_norm": 0.2761273980140686, |
|
"learning_rate": 4.733333333333333e-07, |
|
"loss": 2.0829, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.7480519480519481, |
|
"grad_norm": 0.27279791235923767, |
|
"learning_rate": 4.8e-07, |
|
"loss": 2.1186, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.7584415584415585, |
|
"grad_norm": 0.2563924193382263, |
|
"learning_rate": 4.866666666666666e-07, |
|
"loss": 2.1007, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.7688311688311689, |
|
"grad_norm": 0.271611750125885, |
|
"learning_rate": 4.933333333333333e-07, |
|
"loss": 2.0846, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.7792207792207793, |
|
"grad_norm": 0.2771332561969757, |
|
"learning_rate": 5e-07, |
|
"loss": 2.0993, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.7896103896103897, |
|
"grad_norm": 0.2688583433628082, |
|
"learning_rate": 4.996491228070176e-07, |
|
"loss": 2.0966, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.27472490072250366, |
|
"learning_rate": 4.992982456140351e-07, |
|
"loss": 2.0898, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.8103896103896104, |
|
"grad_norm": 0.2720624506473541, |
|
"learning_rate": 4.989473684210527e-07, |
|
"loss": 2.1063, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.8207792207792208, |
|
"grad_norm": 0.2789762020111084, |
|
"learning_rate": 4.985964912280701e-07, |
|
"loss": 2.1068, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.8311688311688312, |
|
"grad_norm": 0.2771006226539612, |
|
"learning_rate": 4.982456140350877e-07, |
|
"loss": 2.0926, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.8415584415584415, |
|
"grad_norm": 0.2923927307128906, |
|
"learning_rate": 4.978947368421052e-07, |
|
"loss": 2.109, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.8519480519480519, |
|
"grad_norm": 0.2825423777103424, |
|
"learning_rate": 4.975438596491228e-07, |
|
"loss": 2.1115, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.8623376623376623, |
|
"grad_norm": 0.28421661257743835, |
|
"learning_rate": 4.971929824561403e-07, |
|
"loss": 2.1057, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.8727272727272727, |
|
"grad_norm": 0.31991952657699585, |
|
"learning_rate": 4.968421052631579e-07, |
|
"loss": 2.0972, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.8831168831168831, |
|
"grad_norm": 0.28094005584716797, |
|
"learning_rate": 4.964912280701754e-07, |
|
"loss": 2.1209, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.8935064935064935, |
|
"grad_norm": 0.2852063775062561, |
|
"learning_rate": 4.96140350877193e-07, |
|
"loss": 2.1166, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.9038961038961039, |
|
"grad_norm": 0.27665138244628906, |
|
"learning_rate": 4.957894736842105e-07, |
|
"loss": 2.1284, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.9142857142857143, |
|
"grad_norm": 0.2683943808078766, |
|
"learning_rate": 4.954385964912281e-07, |
|
"loss": 2.1049, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.9246753246753247, |
|
"grad_norm": 0.27968302369117737, |
|
"learning_rate": 4.950877192982457e-07, |
|
"loss": 2.1182, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.935064935064935, |
|
"grad_norm": 0.27822941541671753, |
|
"learning_rate": 4.947368421052631e-07, |
|
"loss": 2.1164, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.9454545454545454, |
|
"grad_norm": 0.28795087337493896, |
|
"learning_rate": 4.943859649122807e-07, |
|
"loss": 2.1145, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.9558441558441558, |
|
"grad_norm": 0.2840701639652252, |
|
"learning_rate": 4.940350877192982e-07, |
|
"loss": 2.1172, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.9662337662337662, |
|
"grad_norm": 0.2897505462169647, |
|
"learning_rate": 4.936842105263157e-07, |
|
"loss": 2.0802, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.9766233766233766, |
|
"grad_norm": 0.2831282615661621, |
|
"learning_rate": 4.933333333333333e-07, |
|
"loss": 2.0798, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.987012987012987, |
|
"grad_norm": 0.2663356363773346, |
|
"learning_rate": 4.929824561403508e-07, |
|
"loss": 2.0551, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.9974025974025974, |
|
"grad_norm": 0.2793254852294922, |
|
"learning_rate": 4.926315789473684e-07, |
|
"loss": 2.1085, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.9974025974025974, |
|
"eval_loss": 2.1119654178619385, |
|
"eval_runtime": 9.2468, |
|
"eval_samples_per_second": 2.704, |
|
"eval_steps_per_second": 0.433, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.0077922077922077, |
|
"grad_norm": 0.2765531837940216, |
|
"learning_rate": 4.92280701754386e-07, |
|
"loss": 2.1192, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.018181818181818, |
|
"grad_norm": 0.29020991921424866, |
|
"learning_rate": 4.919298245614035e-07, |
|
"loss": 2.1216, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.0285714285714285, |
|
"grad_norm": 0.2918996512889862, |
|
"learning_rate": 4.915789473684211e-07, |
|
"loss": 2.1035, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.0389610389610389, |
|
"grad_norm": 0.2785792350769043, |
|
"learning_rate": 4.912280701754385e-07, |
|
"loss": 2.1132, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.0493506493506493, |
|
"grad_norm": 0.27520841360092163, |
|
"learning_rate": 4.908771929824561e-07, |
|
"loss": 2.0914, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.0597402597402596, |
|
"grad_norm": 0.2906198799610138, |
|
"learning_rate": 4.905263157894736e-07, |
|
"loss": 2.1184, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.07012987012987, |
|
"grad_norm": 0.29240748286247253, |
|
"learning_rate": 4.901754385964912e-07, |
|
"loss": 2.1002, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.0805194805194804, |
|
"grad_norm": 0.2815570533275604, |
|
"learning_rate": 4.898245614035087e-07, |
|
"loss": 2.1022, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.0909090909090908, |
|
"grad_norm": 0.2839779853820801, |
|
"learning_rate": 4.894736842105263e-07, |
|
"loss": 2.0933, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.1012987012987012, |
|
"grad_norm": 0.28621962666511536, |
|
"learning_rate": 4.891228070175438e-07, |
|
"loss": 2.0967, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.1116883116883116, |
|
"grad_norm": 0.28691983222961426, |
|
"learning_rate": 4.887719298245614e-07, |
|
"loss": 2.09, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.122077922077922, |
|
"grad_norm": 0.28074637055397034, |
|
"learning_rate": 4.884210526315789e-07, |
|
"loss": 2.0933, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.1324675324675324, |
|
"grad_norm": 0.27770230174064636, |
|
"learning_rate": 4.880701754385965e-07, |
|
"loss": 2.0589, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.1428571428571428, |
|
"grad_norm": 0.30154216289520264, |
|
"learning_rate": 4.877192982456141e-07, |
|
"loss": 2.1361, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.1532467532467532, |
|
"grad_norm": 0.29188480973243713, |
|
"learning_rate": 4.873684210526315e-07, |
|
"loss": 2.1015, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.1636363636363636, |
|
"grad_norm": 0.2855170965194702, |
|
"learning_rate": 4.870175438596491e-07, |
|
"loss": 2.1281, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.174025974025974, |
|
"grad_norm": 0.2731803059577942, |
|
"learning_rate": 4.866666666666666e-07, |
|
"loss": 2.108, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.1844155844155844, |
|
"grad_norm": 0.29748019576072693, |
|
"learning_rate": 4.863157894736842e-07, |
|
"loss": 2.1409, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.1948051948051948, |
|
"grad_norm": 0.3487149178981781, |
|
"learning_rate": 4.859649122807017e-07, |
|
"loss": 2.1189, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.2051948051948052, |
|
"grad_norm": 0.2950851023197174, |
|
"learning_rate": 4.856140350877193e-07, |
|
"loss": 2.1297, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.2155844155844155, |
|
"grad_norm": 0.28961101174354553, |
|
"learning_rate": 4.852631578947368e-07, |
|
"loss": 2.1161, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.225974025974026, |
|
"grad_norm": 0.2877264618873596, |
|
"learning_rate": 4.849122807017544e-07, |
|
"loss": 2.0852, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.2363636363636363, |
|
"grad_norm": 0.29030388593673706, |
|
"learning_rate": 4.845614035087719e-07, |
|
"loss": 2.1115, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.2467532467532467, |
|
"grad_norm": 0.2916348874568939, |
|
"learning_rate": 4.842105263157895e-07, |
|
"loss": 2.0792, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.2571428571428571, |
|
"grad_norm": 0.2899838984012604, |
|
"learning_rate": 4.838596491228071e-07, |
|
"loss": 2.118, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.2675324675324675, |
|
"grad_norm": 0.28898170590400696, |
|
"learning_rate": 4.835087719298245e-07, |
|
"loss": 2.0958, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.277922077922078, |
|
"grad_norm": 0.29052966833114624, |
|
"learning_rate": 4.831578947368421e-07, |
|
"loss": 2.0883, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.2883116883116883, |
|
"grad_norm": 0.29824158549308777, |
|
"learning_rate": 4.828070175438596e-07, |
|
"loss": 2.1201, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.2987012987012987, |
|
"grad_norm": 0.2876146733760834, |
|
"learning_rate": 4.824561403508772e-07, |
|
"loss": 2.0847, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.309090909090909, |
|
"grad_norm": 0.29399487376213074, |
|
"learning_rate": 4.821052631578947e-07, |
|
"loss": 2.124, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.3194805194805195, |
|
"grad_norm": 0.28400611877441406, |
|
"learning_rate": 4.817543859649122e-07, |
|
"loss": 2.1024, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.3298701298701299, |
|
"grad_norm": 0.2978748083114624, |
|
"learning_rate": 4.814035087719298e-07, |
|
"loss": 2.1282, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.3402597402597403, |
|
"grad_norm": 0.30035582184791565, |
|
"learning_rate": 4.810526315789473e-07, |
|
"loss": 2.103, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.3506493506493507, |
|
"grad_norm": 0.29803967475891113, |
|
"learning_rate": 4.807017543859649e-07, |
|
"loss": 2.075, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.361038961038961, |
|
"grad_norm": 0.2902166247367859, |
|
"learning_rate": 4.803508771929825e-07, |
|
"loss": 2.1254, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.3714285714285714, |
|
"grad_norm": 0.3193589448928833, |
|
"learning_rate": 4.8e-07, |
|
"loss": 2.1075, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.3818181818181818, |
|
"grad_norm": 0.2991260290145874, |
|
"learning_rate": 4.796491228070176e-07, |
|
"loss": 2.101, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.3922077922077922, |
|
"grad_norm": 0.2965753376483917, |
|
"learning_rate": 4.79298245614035e-07, |
|
"loss": 2.1326, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.4025974025974026, |
|
"grad_norm": 0.29871872067451477, |
|
"learning_rate": 4.789473684210526e-07, |
|
"loss": 2.1365, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.412987012987013, |
|
"grad_norm": 0.3004847764968872, |
|
"learning_rate": 4.785964912280701e-07, |
|
"loss": 2.0904, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.4233766233766234, |
|
"grad_norm": 0.3125920295715332, |
|
"learning_rate": 4.782456140350877e-07, |
|
"loss": 2.1208, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.4337662337662338, |
|
"grad_norm": 0.2945519685745239, |
|
"learning_rate": 4.778947368421052e-07, |
|
"loss": 2.1064, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.4441558441558442, |
|
"grad_norm": 0.2995961308479309, |
|
"learning_rate": 4.775438596491228e-07, |
|
"loss": 2.1184, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.4545454545454546, |
|
"grad_norm": 0.30309098958969116, |
|
"learning_rate": 4.771929824561403e-07, |
|
"loss": 2.0869, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.464935064935065, |
|
"grad_norm": 0.3021141290664673, |
|
"learning_rate": 4.768421052631579e-07, |
|
"loss": 2.1027, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.4753246753246754, |
|
"grad_norm": 0.3091490864753723, |
|
"learning_rate": 4.7649122807017547e-07, |
|
"loss": 2.1051, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.4857142857142858, |
|
"grad_norm": 0.29339122772216797, |
|
"learning_rate": 4.7614035087719296e-07, |
|
"loss": 2.1045, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.4961038961038962, |
|
"grad_norm": 0.3079680800437927, |
|
"learning_rate": 4.757894736842105e-07, |
|
"loss": 2.1066, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.5064935064935066, |
|
"grad_norm": 0.2988393008708954, |
|
"learning_rate": 4.7543859649122804e-07, |
|
"loss": 2.1134, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.516883116883117, |
|
"grad_norm": 0.3002830147743225, |
|
"learning_rate": 4.750877192982456e-07, |
|
"loss": 2.1039, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.5272727272727273, |
|
"grad_norm": 0.3064285218715668, |
|
"learning_rate": 4.747368421052632e-07, |
|
"loss": 2.1319, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.5376623376623377, |
|
"grad_norm": 0.2974233031272888, |
|
"learning_rate": 4.7438596491228066e-07, |
|
"loss": 2.1008, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.5480519480519481, |
|
"grad_norm": 0.2973909080028534, |
|
"learning_rate": 4.7403508771929826e-07, |
|
"loss": 2.1089, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.5584415584415585, |
|
"grad_norm": 0.29095181822776794, |
|
"learning_rate": 4.7368421052631574e-07, |
|
"loss": 2.0994, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.568831168831169, |
|
"grad_norm": 0.30576032400131226, |
|
"learning_rate": 4.733333333333333e-07, |
|
"loss": 2.1019, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.5792207792207793, |
|
"grad_norm": 0.31595948338508606, |
|
"learning_rate": 4.729824561403509e-07, |
|
"loss": 2.1072, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.5896103896103897, |
|
"grad_norm": 0.3032829165458679, |
|
"learning_rate": 4.7263157894736837e-07, |
|
"loss": 2.1066, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.302549809217453, |
|
"learning_rate": 4.7228070175438596e-07, |
|
"loss": 2.0903, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.6103896103896105, |
|
"grad_norm": 0.30395421385765076, |
|
"learning_rate": 4.719298245614035e-07, |
|
"loss": 2.1183, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.6207792207792209, |
|
"grad_norm": 0.30267906188964844, |
|
"learning_rate": 4.7157894736842104e-07, |
|
"loss": 2.0934, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.6311688311688313, |
|
"grad_norm": 0.30717286467552185, |
|
"learning_rate": 4.712280701754386e-07, |
|
"loss": 2.0956, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.6415584415584417, |
|
"grad_norm": 0.30463624000549316, |
|
"learning_rate": 4.7087719298245607e-07, |
|
"loss": 2.091, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.651948051948052, |
|
"grad_norm": 0.3144147992134094, |
|
"learning_rate": 4.7052631578947366e-07, |
|
"loss": 2.1109, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.6623376623376624, |
|
"grad_norm": 0.30240705609321594, |
|
"learning_rate": 4.701754385964912e-07, |
|
"loss": 2.0907, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.6727272727272728, |
|
"grad_norm": 0.2900020182132721, |
|
"learning_rate": 4.6982456140350874e-07, |
|
"loss": 2.0762, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.6831168831168832, |
|
"grad_norm": 0.2979305684566498, |
|
"learning_rate": 4.694736842105263e-07, |
|
"loss": 2.0849, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.6935064935064936, |
|
"grad_norm": 0.3039534389972687, |
|
"learning_rate": 4.691228070175439e-07, |
|
"loss": 2.0975, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.703896103896104, |
|
"grad_norm": 0.30724868178367615, |
|
"learning_rate": 4.6877192982456137e-07, |
|
"loss": 2.1221, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.7142857142857144, |
|
"grad_norm": 0.3075260818004608, |
|
"learning_rate": 4.6842105263157896e-07, |
|
"loss": 2.0954, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.7246753246753248, |
|
"grad_norm": 0.3092438876628876, |
|
"learning_rate": 4.6807017543859645e-07, |
|
"loss": 2.1003, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.7350649350649352, |
|
"grad_norm": 0.3084464371204376, |
|
"learning_rate": 4.67719298245614e-07, |
|
"loss": 2.1137, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.7454545454545456, |
|
"grad_norm": 0.3065555989742279, |
|
"learning_rate": 4.673684210526316e-07, |
|
"loss": 2.1015, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.755844155844156, |
|
"grad_norm": 0.304202139377594, |
|
"learning_rate": 4.6701754385964907e-07, |
|
"loss": 2.097, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.7662337662337664, |
|
"grad_norm": 0.31230100989341736, |
|
"learning_rate": 4.6666666666666666e-07, |
|
"loss": 2.1061, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.7766233766233768, |
|
"grad_norm": 0.31154370307922363, |
|
"learning_rate": 4.6631578947368415e-07, |
|
"loss": 2.1021, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.7870129870129872, |
|
"grad_norm": 0.30575987696647644, |
|
"learning_rate": 4.6596491228070174e-07, |
|
"loss": 2.0911, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.7974025974025976, |
|
"grad_norm": 0.3098468780517578, |
|
"learning_rate": 4.656140350877193e-07, |
|
"loss": 2.0824, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.807792207792208, |
|
"grad_norm": 0.32056814432144165, |
|
"learning_rate": 4.652631578947368e-07, |
|
"loss": 2.0887, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.8181818181818183, |
|
"grad_norm": 0.3210814595222473, |
|
"learning_rate": 4.6491228070175437e-07, |
|
"loss": 2.1179, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.8285714285714287, |
|
"grad_norm": 0.31061846017837524, |
|
"learning_rate": 4.645614035087719e-07, |
|
"loss": 2.0973, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.838961038961039, |
|
"grad_norm": 0.3153719902038574, |
|
"learning_rate": 4.6421052631578945e-07, |
|
"loss": 2.1132, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.8493506493506493, |
|
"grad_norm": 0.3217187821865082, |
|
"learning_rate": 4.63859649122807e-07, |
|
"loss": 2.1069, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.8597402597402597, |
|
"grad_norm": 0.31842827796936035, |
|
"learning_rate": 4.6350877192982453e-07, |
|
"loss": 2.1114, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.87012987012987, |
|
"grad_norm": 0.31468620896339417, |
|
"learning_rate": 4.6315789473684207e-07, |
|
"loss": 2.0966, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.8805194805194805, |
|
"grad_norm": 0.3157198131084442, |
|
"learning_rate": 4.6280701754385966e-07, |
|
"loss": 2.1267, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.8909090909090909, |
|
"grad_norm": 0.32008498907089233, |
|
"learning_rate": 4.6245614035087715e-07, |
|
"loss": 2.0975, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.9012987012987013, |
|
"grad_norm": 0.314317911863327, |
|
"learning_rate": 4.6210526315789475e-07, |
|
"loss": 2.0871, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.9116883116883117, |
|
"grad_norm": 0.3036201298236847, |
|
"learning_rate": 4.617543859649123e-07, |
|
"loss": 2.0865, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.922077922077922, |
|
"grad_norm": 0.3069418668746948, |
|
"learning_rate": 4.614035087719298e-07, |
|
"loss": 2.084, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.9324675324675324, |
|
"grad_norm": 0.31745216250419617, |
|
"learning_rate": 4.6105263157894737e-07, |
|
"loss": 2.0997, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.9428571428571428, |
|
"grad_norm": 0.31132858991622925, |
|
"learning_rate": 4.6070175438596486e-07, |
|
"loss": 2.0687, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.9532467532467532, |
|
"grad_norm": 0.3204294443130493, |
|
"learning_rate": 4.6035087719298245e-07, |
|
"loss": 2.1131, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.9636363636363636, |
|
"grad_norm": 0.31889617443084717, |
|
"learning_rate": 4.6e-07, |
|
"loss": 2.1159, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.974025974025974, |
|
"grad_norm": 0.3156440854072571, |
|
"learning_rate": 4.5964912280701753e-07, |
|
"loss": 2.091, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.9844155844155844, |
|
"grad_norm": 0.3086424171924591, |
|
"learning_rate": 4.5929824561403507e-07, |
|
"loss": 2.0972, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.9948051948051948, |
|
"grad_norm": 0.31145980954170227, |
|
"learning_rate": 4.5894736842105256e-07, |
|
"loss": 2.0773, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.9948051948051948, |
|
"eval_loss": 2.108880043029785, |
|
"eval_runtime": 9.2552, |
|
"eval_samples_per_second": 2.701, |
|
"eval_steps_per_second": 0.432, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 2.005194805194805, |
|
"grad_norm": 0.3105067014694214, |
|
"learning_rate": 4.5859649122807015e-07, |
|
"loss": 2.0807, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 2.0155844155844154, |
|
"grad_norm": 0.3223939538002014, |
|
"learning_rate": 4.582456140350877e-07, |
|
"loss": 2.1253, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 2.0259740259740258, |
|
"grad_norm": 0.3066651523113251, |
|
"learning_rate": 4.5789473684210523e-07, |
|
"loss": 2.079, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 2.036363636363636, |
|
"grad_norm": 0.31992414593696594, |
|
"learning_rate": 4.575438596491228e-07, |
|
"loss": 2.1183, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 2.0467532467532465, |
|
"grad_norm": 0.3262827396392822, |
|
"learning_rate": 4.5719298245614037e-07, |
|
"loss": 2.1356, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 2.057142857142857, |
|
"grad_norm": 0.31810563802719116, |
|
"learning_rate": 4.5684210526315786e-07, |
|
"loss": 2.0954, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 2.0675324675324673, |
|
"grad_norm": 0.3235313892364502, |
|
"learning_rate": 4.5649122807017545e-07, |
|
"loss": 2.1088, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 2.0779220779220777, |
|
"grad_norm": 0.3249002695083618, |
|
"learning_rate": 4.5614035087719294e-07, |
|
"loss": 2.1149, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.088311688311688, |
|
"grad_norm": 0.30616623163223267, |
|
"learning_rate": 4.557894736842105e-07, |
|
"loss": 2.0569, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 2.0987012987012985, |
|
"grad_norm": 0.320377916097641, |
|
"learning_rate": 4.5543859649122807e-07, |
|
"loss": 2.1122, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 2.109090909090909, |
|
"grad_norm": 0.3134180009365082, |
|
"learning_rate": 4.5508771929824556e-07, |
|
"loss": 2.0919, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 2.1194805194805193, |
|
"grad_norm": 0.32783976197242737, |
|
"learning_rate": 4.5473684210526315e-07, |
|
"loss": 2.0984, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 2.1298701298701297, |
|
"grad_norm": 0.32066580653190613, |
|
"learning_rate": 4.543859649122807e-07, |
|
"loss": 2.0876, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 2.14025974025974, |
|
"grad_norm": 0.32365262508392334, |
|
"learning_rate": 4.5403508771929823e-07, |
|
"loss": 2.1082, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 2.1506493506493505, |
|
"grad_norm": 0.3484063148498535, |
|
"learning_rate": 4.536842105263158e-07, |
|
"loss": 2.1351, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 2.161038961038961, |
|
"grad_norm": 0.3321819603443146, |
|
"learning_rate": 4.5333333333333326e-07, |
|
"loss": 2.0974, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 2.1714285714285713, |
|
"grad_norm": 0.32974711060523987, |
|
"learning_rate": 4.5298245614035086e-07, |
|
"loss": 2.1051, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 2.1818181818181817, |
|
"grad_norm": 0.319308876991272, |
|
"learning_rate": 4.526315789473684e-07, |
|
"loss": 2.0817, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.192207792207792, |
|
"grad_norm": 0.3337661325931549, |
|
"learning_rate": 4.5228070175438594e-07, |
|
"loss": 2.1134, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 2.2025974025974024, |
|
"grad_norm": 0.3277778625488281, |
|
"learning_rate": 4.519298245614035e-07, |
|
"loss": 2.09, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 2.212987012987013, |
|
"grad_norm": 0.31506606936454773, |
|
"learning_rate": 4.5157894736842107e-07, |
|
"loss": 2.1009, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 2.2233766233766232, |
|
"grad_norm": 0.34575071930885315, |
|
"learning_rate": 4.5122807017543856e-07, |
|
"loss": 2.1461, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 2.2337662337662336, |
|
"grad_norm": 0.3281601071357727, |
|
"learning_rate": 4.5087719298245615e-07, |
|
"loss": 2.0806, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 2.244155844155844, |
|
"grad_norm": 0.32512006163597107, |
|
"learning_rate": 4.5052631578947364e-07, |
|
"loss": 2.1075, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 2.2545454545454544, |
|
"grad_norm": 0.3208228647708893, |
|
"learning_rate": 4.501754385964912e-07, |
|
"loss": 2.0935, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 2.264935064935065, |
|
"grad_norm": 0.33443257212638855, |
|
"learning_rate": 4.498245614035088e-07, |
|
"loss": 2.1021, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 2.275324675324675, |
|
"grad_norm": 0.3280114531517029, |
|
"learning_rate": 4.4947368421052626e-07, |
|
"loss": 2.1027, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 2.2857142857142856, |
|
"grad_norm": 0.3224703073501587, |
|
"learning_rate": 4.4912280701754386e-07, |
|
"loss": 2.1239, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.296103896103896, |
|
"grad_norm": 0.34066352248191833, |
|
"learning_rate": 4.4877192982456135e-07, |
|
"loss": 2.0794, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 2.3064935064935064, |
|
"grad_norm": 0.3266121745109558, |
|
"learning_rate": 4.4842105263157894e-07, |
|
"loss": 2.115, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.3168831168831168, |
|
"grad_norm": 0.3360849618911743, |
|
"learning_rate": 4.480701754385965e-07, |
|
"loss": 2.1042, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 2.327272727272727, |
|
"grad_norm": 0.33068639039993286, |
|
"learning_rate": 4.47719298245614e-07, |
|
"loss": 2.1146, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 2.3376623376623376, |
|
"grad_norm": 0.32161960005760193, |
|
"learning_rate": 4.4736842105263156e-07, |
|
"loss": 2.0967, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.348051948051948, |
|
"grad_norm": 0.3327421545982361, |
|
"learning_rate": 4.470175438596491e-07, |
|
"loss": 2.097, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 2.3584415584415583, |
|
"grad_norm": 0.326388955116272, |
|
"learning_rate": 4.4666666666666664e-07, |
|
"loss": 2.0919, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 2.3688311688311687, |
|
"grad_norm": 0.3344230353832245, |
|
"learning_rate": 4.463157894736842e-07, |
|
"loss": 2.1358, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 2.379220779220779, |
|
"grad_norm": 0.33497416973114014, |
|
"learning_rate": 4.459649122807017e-07, |
|
"loss": 2.0969, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 2.3896103896103895, |
|
"grad_norm": 0.33529427647590637, |
|
"learning_rate": 4.4561403508771927e-07, |
|
"loss": 2.0841, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.3811129629611969, |
|
"learning_rate": 4.4526315789473686e-07, |
|
"loss": 2.097, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 2.4103896103896103, |
|
"grad_norm": 0.33154216408729553, |
|
"learning_rate": 4.4491228070175435e-07, |
|
"loss": 2.1155, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 2.4207792207792207, |
|
"grad_norm": 0.32626327872276306, |
|
"learning_rate": 4.4456140350877194e-07, |
|
"loss": 2.0988, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 2.431168831168831, |
|
"grad_norm": 0.33323419094085693, |
|
"learning_rate": 4.442105263157895e-07, |
|
"loss": 2.0891, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 2.4415584415584415, |
|
"grad_norm": 0.34010282158851624, |
|
"learning_rate": 4.4385964912280697e-07, |
|
"loss": 2.1, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.451948051948052, |
|
"grad_norm": 0.3339459002017975, |
|
"learning_rate": 4.4350877192982456e-07, |
|
"loss": 2.09, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 2.4623376623376623, |
|
"grad_norm": 0.33162543177604675, |
|
"learning_rate": 4.4315789473684205e-07, |
|
"loss": 2.0698, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 2.4727272727272727, |
|
"grad_norm": 0.3330386281013489, |
|
"learning_rate": 4.4280701754385964e-07, |
|
"loss": 2.0948, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 2.483116883116883, |
|
"grad_norm": 0.33064815402030945, |
|
"learning_rate": 4.424561403508772e-07, |
|
"loss": 2.0764, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 2.4935064935064934, |
|
"grad_norm": 0.3411155641078949, |
|
"learning_rate": 4.421052631578947e-07, |
|
"loss": 2.1208, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.503896103896104, |
|
"grad_norm": 0.3448623716831207, |
|
"learning_rate": 4.4175438596491227e-07, |
|
"loss": 2.1029, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 2.5142857142857142, |
|
"grad_norm": 0.3325323164463043, |
|
"learning_rate": 4.4140350877192975e-07, |
|
"loss": 2.0744, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 2.5246753246753246, |
|
"grad_norm": 0.3473738133907318, |
|
"learning_rate": 4.4105263157894735e-07, |
|
"loss": 2.0711, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 2.535064935064935, |
|
"grad_norm": 0.33442118763923645, |
|
"learning_rate": 4.407017543859649e-07, |
|
"loss": 2.0977, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 2.5454545454545454, |
|
"grad_norm": 0.32157817482948303, |
|
"learning_rate": 4.4035087719298243e-07, |
|
"loss": 2.0851, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.555844155844156, |
|
"grad_norm": 0.3335091471672058, |
|
"learning_rate": 4.3999999999999997e-07, |
|
"loss": 2.1119, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.566233766233766, |
|
"grad_norm": 0.32816439867019653, |
|
"learning_rate": 4.3964912280701756e-07, |
|
"loss": 2.0749, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 2.5766233766233766, |
|
"grad_norm": 0.3414842486381531, |
|
"learning_rate": 4.3929824561403505e-07, |
|
"loss": 2.1005, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 2.587012987012987, |
|
"grad_norm": 0.33331358432769775, |
|
"learning_rate": 4.3894736842105264e-07, |
|
"loss": 2.0987, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 2.5974025974025974, |
|
"grad_norm": 0.3305921256542206, |
|
"learning_rate": 4.3859649122807013e-07, |
|
"loss": 2.1071, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.6077922077922078, |
|
"grad_norm": 0.3360355794429779, |
|
"learning_rate": 4.3824561403508767e-07, |
|
"loss": 2.0854, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 2.618181818181818, |
|
"grad_norm": 0.34479108452796936, |
|
"learning_rate": 4.3789473684210527e-07, |
|
"loss": 2.0948, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.6285714285714286, |
|
"grad_norm": 0.3384932577610016, |
|
"learning_rate": 4.3754385964912275e-07, |
|
"loss": 2.0911, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 2.638961038961039, |
|
"grad_norm": 0.3388522267341614, |
|
"learning_rate": 4.3719298245614035e-07, |
|
"loss": 2.1016, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 2.6493506493506493, |
|
"grad_norm": 0.3315964341163635, |
|
"learning_rate": 4.368421052631579e-07, |
|
"loss": 2.0935, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.6597402597402597, |
|
"grad_norm": 0.34984835982322693, |
|
"learning_rate": 4.3649122807017543e-07, |
|
"loss": 2.1237, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.67012987012987, |
|
"grad_norm": 0.35086217522621155, |
|
"learning_rate": 4.3614035087719297e-07, |
|
"loss": 2.1158, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 2.6805194805194805, |
|
"grad_norm": 0.3384174108505249, |
|
"learning_rate": 4.357894736842105e-07, |
|
"loss": 2.0945, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.690909090909091, |
|
"grad_norm": 0.33980512619018555, |
|
"learning_rate": 4.3543859649122805e-07, |
|
"loss": 2.1045, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 2.7012987012987013, |
|
"grad_norm": 0.3519754111766815, |
|
"learning_rate": 4.350877192982456e-07, |
|
"loss": 2.1155, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.7116883116883117, |
|
"grad_norm": 0.3461025655269623, |
|
"learning_rate": 4.3473684210526313e-07, |
|
"loss": 2.0973, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 2.722077922077922, |
|
"grad_norm": 0.35459649562835693, |
|
"learning_rate": 4.343859649122807e-07, |
|
"loss": 2.0845, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 2.7324675324675325, |
|
"grad_norm": 0.3461463451385498, |
|
"learning_rate": 4.340350877192982e-07, |
|
"loss": 2.087, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 2.742857142857143, |
|
"grad_norm": 0.3309321999549866, |
|
"learning_rate": 4.3368421052631576e-07, |
|
"loss": 2.1044, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.7532467532467533, |
|
"grad_norm": 0.349258154630661, |
|
"learning_rate": 4.3333333333333335e-07, |
|
"loss": 2.101, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.7636363636363637, |
|
"grad_norm": 0.3515391945838928, |
|
"learning_rate": 4.3298245614035084e-07, |
|
"loss": 2.1061, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.774025974025974, |
|
"grad_norm": 0.34546464681625366, |
|
"learning_rate": 4.326315789473684e-07, |
|
"loss": 2.112, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.7844155844155845, |
|
"grad_norm": 0.34706681966781616, |
|
"learning_rate": 4.3228070175438597e-07, |
|
"loss": 2.1201, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.794805194805195, |
|
"grad_norm": 0.34747087955474854, |
|
"learning_rate": 4.3192982456140346e-07, |
|
"loss": 2.0849, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 2.8051948051948052, |
|
"grad_norm": 0.34778711199760437, |
|
"learning_rate": 4.3157894736842105e-07, |
|
"loss": 2.092, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.8155844155844156, |
|
"grad_norm": 0.34066757559776306, |
|
"learning_rate": 4.3122807017543854e-07, |
|
"loss": 2.0752, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 2.825974025974026, |
|
"grad_norm": 0.342899352312088, |
|
"learning_rate": 4.3087719298245613e-07, |
|
"loss": 2.1058, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.8363636363636364, |
|
"grad_norm": 0.34262892603874207, |
|
"learning_rate": 4.305263157894737e-07, |
|
"loss": 2.0855, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 2.846753246753247, |
|
"grad_norm": 0.34279361367225647, |
|
"learning_rate": 4.301754385964912e-07, |
|
"loss": 2.0939, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 0.34948858618736267, |
|
"learning_rate": 4.2982456140350876e-07, |
|
"loss": 2.1111, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.8675324675324676, |
|
"grad_norm": 0.34498292207717896, |
|
"learning_rate": 4.294736842105263e-07, |
|
"loss": 2.0952, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.877922077922078, |
|
"grad_norm": 0.34956324100494385, |
|
"learning_rate": 4.2912280701754384e-07, |
|
"loss": 2.1095, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 2.8883116883116884, |
|
"grad_norm": 0.341450035572052, |
|
"learning_rate": 4.287719298245614e-07, |
|
"loss": 2.0893, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.898701298701299, |
|
"grad_norm": 0.34633004665374756, |
|
"learning_rate": 4.284210526315789e-07, |
|
"loss": 2.1014, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 2.909090909090909, |
|
"grad_norm": 0.35773593187332153, |
|
"learning_rate": 4.2807017543859646e-07, |
|
"loss": 2.1367, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.9194805194805196, |
|
"grad_norm": 0.34510213136672974, |
|
"learning_rate": 4.2771929824561405e-07, |
|
"loss": 2.0917, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 2.92987012987013, |
|
"grad_norm": 0.34937089681625366, |
|
"learning_rate": 4.2736842105263154e-07, |
|
"loss": 2.078, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 2.9402597402597404, |
|
"grad_norm": 0.35117366909980774, |
|
"learning_rate": 4.2701754385964913e-07, |
|
"loss": 2.0762, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 2.9506493506493507, |
|
"grad_norm": 0.3488243520259857, |
|
"learning_rate": 4.266666666666667e-07, |
|
"loss": 2.1195, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 2.961038961038961, |
|
"grad_norm": 0.3523004949092865, |
|
"learning_rate": 4.2631578947368416e-07, |
|
"loss": 2.093, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.9714285714285715, |
|
"grad_norm": 0.35649630427360535, |
|
"learning_rate": 4.2596491228070176e-07, |
|
"loss": 2.0939, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 2.981818181818182, |
|
"grad_norm": 0.3453286588191986, |
|
"learning_rate": 4.2561403508771924e-07, |
|
"loss": 2.077, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 2.9922077922077923, |
|
"grad_norm": 0.3536374270915985, |
|
"learning_rate": 4.2526315789473684e-07, |
|
"loss": 2.1049, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.9922077922077923, |
|
"eval_loss": 2.103614568710327, |
|
"eval_runtime": 9.2191, |
|
"eval_samples_per_second": 2.712, |
|
"eval_steps_per_second": 0.434, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 3.0025974025974027, |
|
"grad_norm": 0.34745439887046814, |
|
"learning_rate": 4.249122807017544e-07, |
|
"loss": 2.1012, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 3.012987012987013, |
|
"grad_norm": 0.3465849459171295, |
|
"learning_rate": 4.245614035087719e-07, |
|
"loss": 2.0759, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 3.0233766233766235, |
|
"grad_norm": 0.34689581394195557, |
|
"learning_rate": 4.2421052631578946e-07, |
|
"loss": 2.0887, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 3.033766233766234, |
|
"grad_norm": 0.3415219485759735, |
|
"learning_rate": 4.2385964912280695e-07, |
|
"loss": 2.0486, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 3.0441558441558443, |
|
"grad_norm": 0.34754109382629395, |
|
"learning_rate": 4.2350877192982454e-07, |
|
"loss": 2.0737, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 3.0545454545454547, |
|
"grad_norm": 0.3473778963088989, |
|
"learning_rate": 4.231578947368421e-07, |
|
"loss": 2.0727, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 3.064935064935065, |
|
"grad_norm": 0.3525889217853546, |
|
"learning_rate": 4.228070175438596e-07, |
|
"loss": 2.0836, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 3.0753246753246755, |
|
"grad_norm": 0.34054872393608093, |
|
"learning_rate": 4.2245614035087716e-07, |
|
"loss": 2.0951, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 3.085714285714286, |
|
"grad_norm": 0.35042187571525574, |
|
"learning_rate": 4.2210526315789476e-07, |
|
"loss": 2.0902, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 3.0961038961038962, |
|
"grad_norm": 0.33843863010406494, |
|
"learning_rate": 4.2175438596491225e-07, |
|
"loss": 2.1046, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 3.1064935064935066, |
|
"grad_norm": 0.36478835344314575, |
|
"learning_rate": 4.2140350877192984e-07, |
|
"loss": 2.0959, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 3.116883116883117, |
|
"grad_norm": 0.36838847398757935, |
|
"learning_rate": 4.2105263157894733e-07, |
|
"loss": 2.1017, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.1272727272727274, |
|
"grad_norm": 0.3492492735385895, |
|
"learning_rate": 4.2070175438596487e-07, |
|
"loss": 2.0701, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 3.137662337662338, |
|
"grad_norm": 0.3571326732635498, |
|
"learning_rate": 4.2035087719298246e-07, |
|
"loss": 2.1061, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 3.148051948051948, |
|
"grad_norm": 0.35111817717552185, |
|
"learning_rate": 4.1999999999999995e-07, |
|
"loss": 2.1311, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 3.1584415584415586, |
|
"grad_norm": 0.3499947786331177, |
|
"learning_rate": 4.1964912280701754e-07, |
|
"loss": 2.0856, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 3.168831168831169, |
|
"grad_norm": 0.3530219495296478, |
|
"learning_rate": 4.192982456140351e-07, |
|
"loss": 2.0947, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 3.1792207792207794, |
|
"grad_norm": 0.3510328233242035, |
|
"learning_rate": 4.189473684210526e-07, |
|
"loss": 2.1088, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 3.18961038961039, |
|
"grad_norm": 0.36004605889320374, |
|
"learning_rate": 4.1859649122807017e-07, |
|
"loss": 2.1258, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 0.35782137513160706, |
|
"learning_rate": 4.182456140350877e-07, |
|
"loss": 2.1167, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 3.2103896103896106, |
|
"grad_norm": 0.35306283831596375, |
|
"learning_rate": 4.1789473684210525e-07, |
|
"loss": 2.0918, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 3.220779220779221, |
|
"grad_norm": 0.35812172293663025, |
|
"learning_rate": 4.175438596491228e-07, |
|
"loss": 2.0857, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 3.2311688311688314, |
|
"grad_norm": 0.35852885246276855, |
|
"learning_rate": 4.1719298245614033e-07, |
|
"loss": 2.0866, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 3.2415584415584417, |
|
"grad_norm": 0.3610192537307739, |
|
"learning_rate": 4.1684210526315787e-07, |
|
"loss": 2.0741, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 3.2519480519480517, |
|
"grad_norm": 0.36115092039108276, |
|
"learning_rate": 4.164912280701754e-07, |
|
"loss": 2.0727, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 3.2623376623376625, |
|
"grad_norm": 0.35251882672309875, |
|
"learning_rate": 4.1614035087719295e-07, |
|
"loss": 2.0991, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 3.2727272727272725, |
|
"grad_norm": 0.35901498794555664, |
|
"learning_rate": 4.1578947368421054e-07, |
|
"loss": 2.1008, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 3.2831168831168833, |
|
"grad_norm": 0.3722142279148102, |
|
"learning_rate": 4.1543859649122803e-07, |
|
"loss": 2.1037, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 3.2935064935064933, |
|
"grad_norm": 0.3581855893135071, |
|
"learning_rate": 4.1508771929824557e-07, |
|
"loss": 2.0989, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 3.303896103896104, |
|
"grad_norm": 0.3623902499675751, |
|
"learning_rate": 4.1473684210526317e-07, |
|
"loss": 2.0989, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 3.314285714285714, |
|
"grad_norm": 0.3492812514305115, |
|
"learning_rate": 4.1438596491228065e-07, |
|
"loss": 2.1105, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 3.324675324675325, |
|
"grad_norm": 0.36448824405670166, |
|
"learning_rate": 4.1403508771929825e-07, |
|
"loss": 2.097, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 3.335064935064935, |
|
"grad_norm": 0.36863312125205994, |
|
"learning_rate": 4.1368421052631574e-07, |
|
"loss": 2.1136, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 3.3454545454545457, |
|
"grad_norm": 0.3554861545562744, |
|
"learning_rate": 4.1333333333333333e-07, |
|
"loss": 2.0715, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 3.3558441558441556, |
|
"grad_norm": 0.36161860823631287, |
|
"learning_rate": 4.1298245614035087e-07, |
|
"loss": 2.0668, |
|
"step": 323 |
|
    },
    {
      "epoch": 3.3662337662337665,
      "grad_norm": 0.3509725332260132,
      "learning_rate": 4.126315789473684e-07,
      "loss": 2.0771,
      "step": 324
    },
    {
      "epoch": 3.3766233766233764,
      "grad_norm": 0.3517126739025116,
      "learning_rate": 4.1228070175438595e-07,
      "loss": 2.0805,
      "step": 325
    },
    {
      "epoch": 3.3870129870129873,
      "grad_norm": 0.35390713810920715,
      "learning_rate": 4.119298245614035e-07,
      "loss": 2.0988,
      "step": 326
    },
    {
      "epoch": 3.397402597402597,
      "grad_norm": 0.36263352632522583,
      "learning_rate": 4.1157894736842103e-07,
      "loss": 2.0936,
      "step": 327
    },
    {
      "epoch": 3.407792207792208,
      "grad_norm": 0.3613211214542389,
      "learning_rate": 4.1122807017543857e-07,
      "loss": 2.1124,
      "step": 328
    },
    {
      "epoch": 3.418181818181818,
      "grad_norm": 0.3665166199207306,
      "learning_rate": 4.108771929824561e-07,
      "loss": 2.0896,
      "step": 329
    },
    {
      "epoch": 3.4285714285714284,
      "grad_norm": 0.36971572041511536,
      "learning_rate": 4.1052631578947365e-07,
      "loss": 2.0941,
      "step": 330
    },
    {
      "epoch": 3.4389610389610388,
      "grad_norm": 0.3668130338191986,
      "learning_rate": 4.1017543859649125e-07,
      "loss": 2.0919,
      "step": 331
    },
    {
      "epoch": 3.449350649350649,
      "grad_norm": 0.36285045742988586,
      "learning_rate": 4.0982456140350874e-07,
      "loss": 2.0721,
      "step": 332
    },
    {
      "epoch": 3.4597402597402596,
      "grad_norm": 0.3531608581542969,
      "learning_rate": 4.0947368421052633e-07,
      "loss": 2.0658,
      "step": 333
    },
    {
      "epoch": 3.47012987012987,
      "grad_norm": 0.35173851251602173,
      "learning_rate": 4.091228070175438e-07,
      "loss": 2.0844,
      "step": 334
    },
    {
      "epoch": 3.4805194805194803,
      "grad_norm": 0.3617774248123169,
      "learning_rate": 4.0877192982456136e-07,
      "loss": 2.0994,
      "step": 335
    },
    {
      "epoch": 3.4909090909090907,
      "grad_norm": 0.3606766164302826,
      "learning_rate": 4.0842105263157895e-07,
      "loss": 2.1015,
      "step": 336
    },
    {
      "epoch": 3.501298701298701,
      "grad_norm": 0.3587430417537689,
      "learning_rate": 4.0807017543859644e-07,
      "loss": 2.0928,
      "step": 337
    },
    {
      "epoch": 3.5116883116883115,
      "grad_norm": 0.3543533682823181,
      "learning_rate": 4.0771929824561403e-07,
      "loss": 2.0947,
      "step": 338
    },
    {
      "epoch": 3.522077922077922,
      "grad_norm": 0.35451260209083557,
      "learning_rate": 4.073684210526316e-07,
      "loss": 2.098,
      "step": 339
    },
    {
      "epoch": 3.5324675324675323,
      "grad_norm": 0.35621732473373413,
      "learning_rate": 4.070175438596491e-07,
      "loss": 2.0926,
      "step": 340
    },
    {
      "epoch": 3.5428571428571427,
      "grad_norm": 0.3654091954231262,
      "learning_rate": 4.0666666666666666e-07,
      "loss": 2.113,
      "step": 341
    },
    {
      "epoch": 3.553246753246753,
      "grad_norm": 0.40453559160232544,
      "learning_rate": 4.0631578947368414e-07,
      "loss": 2.1097,
      "step": 342
    },
    {
      "epoch": 3.5636363636363635,
      "grad_norm": 0.3696524202823639,
      "learning_rate": 4.0596491228070174e-07,
      "loss": 2.1117,
      "step": 343
    },
    {
      "epoch": 3.574025974025974,
      "grad_norm": 0.36527544260025024,
      "learning_rate": 4.056140350877193e-07,
      "loss": 2.0999,
      "step": 344
    },
    {
      "epoch": 3.5844155844155843,
      "grad_norm": 0.3624621033668518,
      "learning_rate": 4.052631578947368e-07,
      "loss": 2.1076,
      "step": 345
    },
    {
      "epoch": 3.5948051948051947,
      "grad_norm": 0.3703259527683258,
      "learning_rate": 4.0491228070175436e-07,
      "loss": 2.0913,
      "step": 346
    },
    {
      "epoch": 3.605194805194805,
      "grad_norm": 0.36622118949890137,
      "learning_rate": 4.0456140350877195e-07,
      "loss": 2.099,
      "step": 347
    },
    {
      "epoch": 3.6155844155844155,
      "grad_norm": 0.36391839385032654,
      "learning_rate": 4.0421052631578944e-07,
      "loss": 2.0866,
      "step": 348
    },
    {
      "epoch": 3.625974025974026,
      "grad_norm": 0.3735998272895813,
      "learning_rate": 4.0385964912280703e-07,
      "loss": 2.1034,
      "step": 349
    },
    {
      "epoch": 3.6363636363636362,
      "grad_norm": 0.3779623508453369,
      "learning_rate": 4.035087719298245e-07,
      "loss": 2.0702,
      "step": 350
    },
    {
      "epoch": 3.6467532467532466,
      "grad_norm": 0.3697824776172638,
      "learning_rate": 4.0315789473684206e-07,
      "loss": 2.1214,
      "step": 351
    },
    {
      "epoch": 3.657142857142857,
      "grad_norm": 0.3565923273563385,
      "learning_rate": 4.0280701754385966e-07,
      "loss": 2.0978,
      "step": 352
    },
    {
      "epoch": 3.6675324675324674,
      "grad_norm": 0.37301158905029297,
      "learning_rate": 4.0245614035087714e-07,
      "loss": 2.1241,
      "step": 353
    },
    {
      "epoch": 3.677922077922078,
      "grad_norm": 0.371933251619339,
      "learning_rate": 4.0210526315789474e-07,
      "loss": 2.0836,
      "step": 354
    },
    {
      "epoch": 3.688311688311688,
      "grad_norm": 0.38711678981781006,
      "learning_rate": 4.017543859649123e-07,
      "loss": 2.0836,
      "step": 355
    },
    {
      "epoch": 3.6987012987012986,
      "grad_norm": 0.37659522891044617,
      "learning_rate": 4.014035087719298e-07,
      "loss": 2.0925,
      "step": 356
    },
    {
      "epoch": 3.709090909090909,
      "grad_norm": 0.37264522910118103,
      "learning_rate": 4.0105263157894736e-07,
      "loss": 2.0803,
      "step": 357
    },
    {
      "epoch": 3.7194805194805194,
      "grad_norm": 0.376504510641098,
      "learning_rate": 4.007017543859649e-07,
      "loss": 2.0925,
      "step": 358
    },
    {
      "epoch": 3.72987012987013,
      "grad_norm": 0.3663710653781891,
      "learning_rate": 4.0035087719298244e-07,
      "loss": 2.1138,
      "step": 359
    },
    {
      "epoch": 3.74025974025974,
      "grad_norm": 0.3685579001903534,
      "learning_rate": 4e-07,
      "loss": 2.0498,
      "step": 360
    },
    {
      "epoch": 3.7506493506493506,
      "grad_norm": 0.3644115626811981,
      "learning_rate": 3.996491228070175e-07,
      "loss": 2.1242,
      "step": 361
    },
    {
      "epoch": 3.761038961038961,
      "grad_norm": 0.3706153333187103,
      "learning_rate": 3.9929824561403506e-07,
      "loss": 2.0872,
      "step": 362
    },
    {
      "epoch": 3.7714285714285714,
      "grad_norm": 0.3608177900314331,
      "learning_rate": 3.989473684210526e-07,
      "loss": 2.0553,
      "step": 363
    },
    {
      "epoch": 3.7818181818181817,
      "grad_norm": 0.36829179525375366,
      "learning_rate": 3.9859649122807014e-07,
      "loss": 2.1001,
      "step": 364
    },
    {
      "epoch": 3.792207792207792,
      "grad_norm": 0.3892737925052643,
      "learning_rate": 3.9824561403508774e-07,
      "loss": 2.0975,
      "step": 365
    },
    {
      "epoch": 3.8025974025974025,
      "grad_norm": 0.3626904785633087,
      "learning_rate": 3.978947368421052e-07,
      "loss": 2.081,
      "step": 366
    },
    {
      "epoch": 3.812987012987013,
      "grad_norm": 0.382160484790802,
      "learning_rate": 3.9754385964912277e-07,
      "loss": 2.095,
      "step": 367
    },
    {
      "epoch": 3.8233766233766233,
      "grad_norm": 0.3701639473438263,
      "learning_rate": 3.9719298245614036e-07,
      "loss": 2.0755,
      "step": 368
    },
    {
      "epoch": 3.8337662337662337,
      "grad_norm": 0.3776034712791443,
      "learning_rate": 3.9684210526315785e-07,
      "loss": 2.1052,
      "step": 369
    },
    {
      "epoch": 3.844155844155844,
      "grad_norm": 0.3688242733478546,
      "learning_rate": 3.9649122807017544e-07,
      "loss": 2.085,
      "step": 370
    },
    {
      "epoch": 3.8545454545454545,
      "grad_norm": 0.3863198161125183,
      "learning_rate": 3.9614035087719293e-07,
      "loss": 2.1112,
      "step": 371
    },
    {
      "epoch": 3.864935064935065,
      "grad_norm": 0.37687695026397705,
      "learning_rate": 3.957894736842105e-07,
      "loss": 2.102,
      "step": 372
    },
    {
      "epoch": 3.8753246753246753,
      "grad_norm": 0.3751150071620941,
      "learning_rate": 3.9543859649122806e-07,
      "loss": 2.1052,
      "step": 373
    },
    {
      "epoch": 3.8857142857142857,
      "grad_norm": 0.3695361018180847,
      "learning_rate": 3.950877192982456e-07,
      "loss": 2.0787,
      "step": 374
    },
    {
      "epoch": 3.896103896103896,
      "grad_norm": 0.3684975802898407,
      "learning_rate": 3.9473684210526315e-07,
      "loss": 2.0708,
      "step": 375
    },
    {
      "epoch": 3.9064935064935065,
      "grad_norm": 0.3755075931549072,
      "learning_rate": 3.943859649122807e-07,
      "loss": 2.0886,
      "step": 376
    },
    {
      "epoch": 3.916883116883117,
      "grad_norm": 0.3694201707839966,
      "learning_rate": 3.9403508771929823e-07,
      "loss": 2.0892,
      "step": 377
    },
    {
      "epoch": 3.9272727272727272,
      "grad_norm": 0.3803527057170868,
      "learning_rate": 3.9368421052631577e-07,
      "loss": 2.0866,
      "step": 378
    },
    {
      "epoch": 3.9376623376623376,
      "grad_norm": 0.3805652856826782,
      "learning_rate": 3.933333333333333e-07,
      "loss": 2.1028,
      "step": 379
    },
    {
      "epoch": 3.948051948051948,
      "grad_norm": 0.3854037821292877,
      "learning_rate": 3.9298245614035085e-07,
      "loss": 2.0945,
      "step": 380
    },
    {
      "epoch": 3.9584415584415584,
      "grad_norm": 0.37754347920417786,
      "learning_rate": 3.9263157894736844e-07,
      "loss": 2.0963,
      "step": 381
    },
    {
      "epoch": 3.968831168831169,
      "grad_norm": 0.3717576265335083,
      "learning_rate": 3.9228070175438593e-07,
      "loss": 2.1052,
      "step": 382
    },
    {
      "epoch": 3.979220779220779,
      "grad_norm": 0.35753440856933594,
      "learning_rate": 3.919298245614035e-07,
      "loss": 2.0813,
      "step": 383
    },
    {
      "epoch": 3.9896103896103896,
      "grad_norm": 0.3805282413959503,
      "learning_rate": 3.91578947368421e-07,
      "loss": 2.0932,
      "step": 384
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.3768204152584076,
      "learning_rate": 3.9122807017543855e-07,
      "loss": 2.1048,
      "step": 385
    },
    {
      "epoch": 4.0,
      "eval_loss": 2.096661329269409,
      "eval_runtime": 9.2491,
      "eval_samples_per_second": 2.703,
      "eval_steps_per_second": 0.432,
      "step": 385
    },
    {
      "epoch": 4.01038961038961,
      "grad_norm": 0.3841288685798645,
      "learning_rate": 3.9087719298245615e-07,
      "loss": 2.1132,
      "step": 386
    },
    {
      "epoch": 4.020779220779221,
      "grad_norm": 0.3716009855270386,
      "learning_rate": 3.9052631578947363e-07,
      "loss": 2.1147,
      "step": 387
    },
    {
      "epoch": 4.031168831168831,
      "grad_norm": 0.37986499071121216,
      "learning_rate": 3.9017543859649123e-07,
      "loss": 2.085,
      "step": 388
    },
    {
      "epoch": 4.041558441558442,
      "grad_norm": 0.37856024503707886,
      "learning_rate": 3.8982456140350877e-07,
      "loss": 2.0922,
      "step": 389
    },
    {
      "epoch": 4.0519480519480515,
      "grad_norm": 0.3767644464969635,
      "learning_rate": 3.894736842105263e-07,
      "loss": 2.0762,
      "step": 390
    },
    {
      "epoch": 4.062337662337662,
      "grad_norm": 0.3725319802761078,
      "learning_rate": 3.8912280701754385e-07,
      "loss": 2.0878,
      "step": 391
    },
    {
      "epoch": 4.072727272727272,
      "grad_norm": 0.35704779624938965,
      "learning_rate": 3.8877192982456134e-07,
      "loss": 2.0943,
      "step": 392
    },
    {
      "epoch": 4.083116883116883,
      "grad_norm": 0.3813813626766205,
      "learning_rate": 3.8842105263157893e-07,
      "loss": 2.0769,
      "step": 393
    },
    {
      "epoch": 4.093506493506493,
      "grad_norm": 0.3729550540447235,
      "learning_rate": 3.8807017543859647e-07,
      "loss": 2.0747,
      "step": 394
    },
    {
      "epoch": 4.103896103896104,
      "grad_norm": 0.37814322113990784,
      "learning_rate": 3.87719298245614e-07,
      "loss": 2.0834,
      "step": 395
    },
    {
      "epoch": 4.114285714285714,
      "grad_norm": 0.3847734034061432,
      "learning_rate": 3.8736842105263155e-07,
      "loss": 2.0945,
      "step": 396
    },
    {
      "epoch": 4.124675324675325,
      "grad_norm": 0.3798567056655884,
      "learning_rate": 3.8701754385964915e-07,
      "loss": 2.0756,
      "step": 397
    },
    {
      "epoch": 4.135064935064935,
      "grad_norm": 0.3700884282588959,
      "learning_rate": 3.8666666666666664e-07,
      "loss": 2.0621,
      "step": 398
    },
    {
      "epoch": 4.1454545454545455,
      "grad_norm": 0.36993175745010376,
      "learning_rate": 3.8631578947368423e-07,
      "loss": 2.1099,
      "step": 399
    },
    {
      "epoch": 4.1558441558441555,
      "grad_norm": 0.380310595035553,
      "learning_rate": 3.859649122807017e-07,
      "loss": 2.0981,
      "step": 400
    },
    {
      "epoch": 4.166233766233766,
      "grad_norm": 0.38853439688682556,
      "learning_rate": 3.8561403508771926e-07,
      "loss": 2.1049,
      "step": 401
    },
    {
      "epoch": 4.176623376623376,
      "grad_norm": 0.3776048421859741,
      "learning_rate": 3.8526315789473685e-07,
      "loss": 2.0446,
      "step": 402
    },
    {
      "epoch": 4.187012987012987,
      "grad_norm": 0.38195568323135376,
      "learning_rate": 3.8491228070175434e-07,
      "loss": 2.0813,
      "step": 403
    },
    {
      "epoch": 4.197402597402597,
      "grad_norm": 0.373125284910202,
      "learning_rate": 3.8456140350877193e-07,
      "loss": 2.0903,
      "step": 404
    },
    {
      "epoch": 4.207792207792208,
      "grad_norm": 0.3781803548336029,
      "learning_rate": 3.842105263157894e-07,
      "loss": 2.093,
      "step": 405
    },
    {
      "epoch": 4.218181818181818,
      "grad_norm": 0.38378608226776123,
      "learning_rate": 3.83859649122807e-07,
      "loss": 2.0772,
      "step": 406
    },
    {
      "epoch": 4.228571428571429,
      "grad_norm": 0.3815755248069763,
      "learning_rate": 3.8350877192982455e-07,
      "loss": 2.0876,
      "step": 407
    },
    {
      "epoch": 4.238961038961039,
      "grad_norm": 0.3809583783149719,
      "learning_rate": 3.831578947368421e-07,
      "loss": 2.0631,
      "step": 408
    },
    {
      "epoch": 4.249350649350649,
      "grad_norm": 0.3809110224246979,
      "learning_rate": 3.8280701754385964e-07,
      "loss": 2.1069,
      "step": 409
    },
    {
      "epoch": 4.259740259740259,
      "grad_norm": 0.37152138352394104,
      "learning_rate": 3.824561403508772e-07,
      "loss": 2.0738,
      "step": 410
    },
    {
      "epoch": 4.27012987012987,
      "grad_norm": 0.3761196434497833,
      "learning_rate": 3.821052631578947e-07,
      "loss": 2.0994,
      "step": 411
    },
    {
      "epoch": 4.28051948051948,
      "grad_norm": 0.39031481742858887,
      "learning_rate": 3.8175438596491226e-07,
      "loss": 2.0857,
      "step": 412
    },
    {
      "epoch": 4.290909090909091,
      "grad_norm": 0.37237513065338135,
      "learning_rate": 3.814035087719298e-07,
      "loss": 2.0844,
      "step": 413
    },
    {
      "epoch": 4.301298701298701,
      "grad_norm": 0.38423943519592285,
      "learning_rate": 3.8105263157894734e-07,
      "loss": 2.126,
      "step": 414
    },
    {
      "epoch": 4.311688311688312,
      "grad_norm": 0.36542361974716187,
      "learning_rate": 3.8070175438596493e-07,
      "loss": 2.0705,
      "step": 415
    },
    {
      "epoch": 4.322077922077922,
      "grad_norm": 0.36861154437065125,
      "learning_rate": 3.803508771929824e-07,
      "loss": 2.0681,
      "step": 416
    },
    {
      "epoch": 4.332467532467533,
      "grad_norm": 0.3783316910266876,
      "learning_rate": 3.7999999999999996e-07,
      "loss": 2.0777,
      "step": 417
    },
    {
      "epoch": 4.3428571428571425,
      "grad_norm": 0.38323143124580383,
      "learning_rate": 3.7964912280701756e-07,
      "loss": 2.0952,
      "step": 418
    },
    {
      "epoch": 4.353246753246753,
      "grad_norm": 0.3862488269805908,
      "learning_rate": 3.7929824561403504e-07,
      "loss": 2.0927,
      "step": 419
    },
    {
      "epoch": 4.363636363636363,
      "grad_norm": 0.38100945949554443,
      "learning_rate": 3.7894736842105264e-07,
      "loss": 2.1043,
      "step": 420
    },
    {
      "epoch": 4.374025974025974,
      "grad_norm": 0.38466402888298035,
      "learning_rate": 3.785964912280701e-07,
      "loss": 2.0906,
      "step": 421
    },
    {
      "epoch": 4.384415584415584,
      "grad_norm": 0.37953078746795654,
      "learning_rate": 3.782456140350877e-07,
      "loss": 2.0706,
      "step": 422
    },
    {
      "epoch": 4.394805194805195,
      "grad_norm": 0.3823856711387634,
      "learning_rate": 3.7789473684210526e-07,
      "loss": 2.0951,
      "step": 423
    },
    {
      "epoch": 4.405194805194805,
      "grad_norm": 0.37538549304008484,
      "learning_rate": 3.775438596491228e-07,
      "loss": 2.0771,
      "step": 424
    },
    {
      "epoch": 4.415584415584416,
      "grad_norm": 0.3802937865257263,
      "learning_rate": 3.7719298245614034e-07,
      "loss": 2.078,
      "step": 425
    },
    {
      "epoch": 4.425974025974026,
      "grad_norm": 0.3733079433441162,
      "learning_rate": 3.7684210526315783e-07,
      "loss": 2.0799,
      "step": 426
    },
    {
      "epoch": 4.4363636363636365,
      "grad_norm": 0.37729039788246155,
      "learning_rate": 3.764912280701754e-07,
      "loss": 2.1051,
      "step": 427
    },
    {
      "epoch": 4.4467532467532465,
      "grad_norm": 0.3915861248970032,
      "learning_rate": 3.7614035087719296e-07,
      "loss": 2.0927,
      "step": 428
    },
    {
      "epoch": 4.457142857142857,
      "grad_norm": 0.38771378993988037,
      "learning_rate": 3.757894736842105e-07,
      "loss": 2.0989,
      "step": 429
    },
    {
      "epoch": 4.467532467532467,
      "grad_norm": 0.3854687213897705,
      "learning_rate": 3.7543859649122804e-07,
      "loss": 2.0984,
      "step": 430
    },
    {
      "epoch": 4.477922077922078,
      "grad_norm": 0.3793568015098572,
      "learning_rate": 3.7508771929824564e-07,
      "loss": 2.0804,
      "step": 431
    },
    {
      "epoch": 4.488311688311688,
      "grad_norm": 0.39430853724479675,
      "learning_rate": 3.747368421052631e-07,
      "loss": 2.0985,
      "step": 432
    },
    {
      "epoch": 4.498701298701299,
      "grad_norm": 0.3847366273403168,
      "learning_rate": 3.743859649122807e-07,
      "loss": 2.0849,
      "step": 433
    },
    {
      "epoch": 4.509090909090909,
      "grad_norm": 0.374398797750473,
      "learning_rate": 3.740350877192982e-07,
      "loss": 2.0847,
      "step": 434
    },
    {
      "epoch": 4.51948051948052,
      "grad_norm": 0.4258849620819092,
      "learning_rate": 3.7368421052631575e-07,
      "loss": 2.082,
      "step": 435
    },
    {
      "epoch": 4.52987012987013,
      "grad_norm": 0.3853350579738617,
      "learning_rate": 3.7333333333333334e-07,
      "loss": 2.0832,
      "step": 436
    },
    {
      "epoch": 4.54025974025974,
      "grad_norm": 0.38020631670951843,
      "learning_rate": 3.7298245614035083e-07,
      "loss": 2.0799,
      "step": 437
    },
    {
      "epoch": 4.55064935064935,
      "grad_norm": 0.4022679030895233,
      "learning_rate": 3.726315789473684e-07,
      "loss": 2.1038,
      "step": 438
    },
    {
      "epoch": 4.561038961038961,
      "grad_norm": 0.37137728929519653,
      "learning_rate": 3.7228070175438596e-07,
      "loss": 2.0921,
      "step": 439
    },
    {
      "epoch": 4.571428571428571,
      "grad_norm": 0.38251206278800964,
      "learning_rate": 3.719298245614035e-07,
      "loss": 2.0878,
      "step": 440
    },
    {
      "epoch": 4.581818181818182,
      "grad_norm": 0.39200717210769653,
      "learning_rate": 3.7157894736842104e-07,
      "loss": 2.1202,
      "step": 441
    },
    {
      "epoch": 4.592207792207792,
      "grad_norm": 0.3731335699558258,
      "learning_rate": 3.7122807017543853e-07,
      "loss": 2.0737,
      "step": 442
    },
    {
      "epoch": 4.602597402597403,
      "grad_norm": 0.38276833295822144,
      "learning_rate": 3.708771929824561e-07,
      "loss": 2.0521,
      "step": 443
    },
    {
      "epoch": 4.612987012987013,
      "grad_norm": 0.38775137066841125,
      "learning_rate": 3.7052631578947367e-07,
      "loss": 2.0772,
      "step": 444
    },
    {
      "epoch": 4.623376623376624,
      "grad_norm": 0.3836955428123474,
      "learning_rate": 3.701754385964912e-07,
      "loss": 2.0992,
      "step": 445
    },
    {
      "epoch": 4.6337662337662335,
      "grad_norm": 0.37715139985084534,
      "learning_rate": 3.6982456140350875e-07,
      "loss": 2.0499,
      "step": 446
    },
    {
      "epoch": 4.644155844155844,
      "grad_norm": 0.3789008557796478,
      "learning_rate": 3.6947368421052634e-07,
      "loss": 2.0531,
      "step": 447
    },
    {
      "epoch": 4.654545454545454,
      "grad_norm": 0.3865036964416504,
      "learning_rate": 3.6912280701754383e-07,
      "loss": 2.0949,
      "step": 448
    },
    {
      "epoch": 4.664935064935065,
      "grad_norm": 0.3880210816860199,
      "learning_rate": 3.687719298245614e-07,
      "loss": 2.0871,
      "step": 449
    },
    {
      "epoch": 4.675324675324675,
      "grad_norm": 0.3839876353740692,
      "learning_rate": 3.684210526315789e-07,
      "loss": 2.0586,
      "step": 450
    },
    {
      "epoch": 4.685714285714286,
      "grad_norm": 0.39316463470458984,
      "learning_rate": 3.6807017543859645e-07,
      "loss": 2.0736,
      "step": 451
    },
    {
      "epoch": 4.696103896103896,
      "grad_norm": 0.37328803539276123,
      "learning_rate": 3.6771929824561405e-07,
      "loss": 2.084,
      "step": 452
    },
    {
      "epoch": 4.706493506493507,
      "grad_norm": 0.3884430527687073,
      "learning_rate": 3.6736842105263153e-07,
      "loss": 2.0788,
      "step": 453
    },
    {
      "epoch": 4.716883116883117,
      "grad_norm": 0.385623574256897,
      "learning_rate": 3.6701754385964913e-07,
      "loss": 2.0705,
      "step": 454
    },
    {
      "epoch": 4.7272727272727275,
      "grad_norm": 0.38950812816619873,
      "learning_rate": 3.666666666666666e-07,
      "loss": 2.0785,
      "step": 455
    },
    {
      "epoch": 4.7376623376623375,
      "grad_norm": 0.38535040616989136,
      "learning_rate": 3.663157894736842e-07,
      "loss": 2.0909,
      "step": 456
    },
    {
      "epoch": 4.748051948051948,
      "grad_norm": 0.3869593143463135,
      "learning_rate": 3.6596491228070175e-07,
      "loss": 2.0801,
      "step": 457
    },
    {
      "epoch": 4.758441558441558,
      "grad_norm": 0.39084428548812866,
      "learning_rate": 3.656140350877193e-07,
      "loss": 2.096,
      "step": 458
    },
    {
      "epoch": 4.768831168831169,
      "grad_norm": 0.3794546127319336,
      "learning_rate": 3.6526315789473683e-07,
      "loss": 2.0527,
      "step": 459
    },
    {
      "epoch": 4.779220779220779,
      "grad_norm": 0.3870809078216553,
      "learning_rate": 3.6491228070175437e-07,
      "loss": 2.0853,
      "step": 460
    },
    {
      "epoch": 4.78961038961039,
      "grad_norm": 0.38205036520957947,
      "learning_rate": 3.645614035087719e-07,
      "loss": 2.0643,
      "step": 461
    },
    {
      "epoch": 4.8,
      "grad_norm": 0.3907061815261841,
      "learning_rate": 3.6421052631578945e-07,
      "loss": 2.0786,
      "step": 462
    },
    {
      "epoch": 4.810389610389611,
      "grad_norm": 0.39493080973625183,
      "learning_rate": 3.63859649122807e-07,
      "loss": 2.0944,
      "step": 463
    },
    {
      "epoch": 4.820779220779221,
      "grad_norm": 0.3930380046367645,
      "learning_rate": 3.6350877192982453e-07,
      "loss": 2.1138,
      "step": 464
    },
    {
      "epoch": 4.8311688311688314,
      "grad_norm": 0.3952060639858246,
      "learning_rate": 3.6315789473684213e-07,
      "loss": 2.0802,
      "step": 465
    },
    {
      "epoch": 4.841558441558441,
      "grad_norm": 0.3815995752811432,
      "learning_rate": 3.628070175438596e-07,
      "loss": 2.0838,
      "step": 466
    },
    {
      "epoch": 4.851948051948052,
      "grad_norm": 0.38858020305633545,
      "learning_rate": 3.6245614035087716e-07,
      "loss": 2.0804,
      "step": 467
    },
    {
      "epoch": 4.862337662337662,
      "grad_norm": 0.385565847158432,
      "learning_rate": 3.6210526315789475e-07,
      "loss": 2.0974,
      "step": 468
    },
    {
      "epoch": 4.872727272727273,
      "grad_norm": 0.3909178078174591,
      "learning_rate": 3.6175438596491224e-07,
      "loss": 2.0887,
      "step": 469
    },
    {
      "epoch": 4.883116883116883,
      "grad_norm": 0.3982325792312622,
      "learning_rate": 3.6140350877192983e-07,
      "loss": 2.1054,
      "step": 470
    },
    {
      "epoch": 4.893506493506494,
      "grad_norm": 0.3876339793205261,
      "learning_rate": 3.610526315789473e-07,
      "loss": 2.1054,
      "step": 471
    },
    {
      "epoch": 4.903896103896104,
      "grad_norm": 0.3819069266319275,
      "learning_rate": 3.607017543859649e-07,
      "loss": 2.0821,
      "step": 472
    },
    {
      "epoch": 4.914285714285715,
      "grad_norm": 0.3924694359302521,
      "learning_rate": 3.6035087719298245e-07,
      "loss": 2.0712,
      "step": 473
    },
    {
      "epoch": 4.9246753246753245,
      "grad_norm": 0.3937675654888153,
      "learning_rate": 3.6e-07,
      "loss": 2.1057,
      "step": 474
    },
    {
      "epoch": 4.935064935064935,
      "grad_norm": 0.38620275259017944,
      "learning_rate": 3.5964912280701754e-07,
      "loss": 2.0845,
      "step": 475
    },
    {
      "epoch": 4.945454545454545,
      "grad_norm": 0.40442150831222534,
      "learning_rate": 3.59298245614035e-07,
      "loss": 2.0936,
      "step": 476
    },
    {
      "epoch": 4.955844155844156,
      "grad_norm": 0.3815317153930664,
      "learning_rate": 3.589473684210526e-07,
      "loss": 2.0845,
      "step": 477
    },
    {
      "epoch": 4.966233766233766,
      "grad_norm": 0.38584476709365845,
      "learning_rate": 3.5859649122807016e-07,
      "loss": 2.071,
      "step": 478
    },
    {
      "epoch": 4.976623376623377,
      "grad_norm": 0.3887505829334259,
      "learning_rate": 3.582456140350877e-07,
      "loss": 2.0924,
      "step": 479
    },
    {
      "epoch": 4.987012987012987,
      "grad_norm": 0.3836219012737274,
      "learning_rate": 3.5789473684210524e-07,
      "loss": 2.0986,
      "step": 480
    },
    {
      "epoch": 4.997402597402598,
      "grad_norm": 0.38430362939834595,
      "learning_rate": 3.5754385964912283e-07,
      "loss": 2.1006,
      "step": 481
    },
    {
      "epoch": 4.997402597402598,
      "eval_loss": 2.0890417098999023,
      "eval_runtime": 9.2277,
      "eval_samples_per_second": 2.709,
      "eval_steps_per_second": 0.433,
      "step": 481
    }
  ],
  "logging_steps": 1,
  "max_steps": 1500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 16,
  "save_steps": 300,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.690178034414387e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}