{
"best_metric": 2.0890417098999023,
"best_model_checkpoint": "/data/sunggeunan/ICL/src/outputs/Meta-Llama-3-8B-Instruct_qa_ft_QA_mrqa_nq_SQuAD_3shot_1docs/checkpoint-481",
"epoch": 4.997402597402598,
"eval_steps": 100,
"global_step": 481,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01038961038961039,
"grad_norm": 0.25633782148361206,
"learning_rate": 6.666666666666667e-09,
"loss": 2.1094,
"step": 1
},
{
"epoch": 0.02077922077922078,
"grad_norm": 0.26136407256126404,
"learning_rate": 1.3333333333333334e-08,
"loss": 2.1062,
"step": 2
},
{
"epoch": 0.03116883116883117,
"grad_norm": 0.25959405303001404,
"learning_rate": 2e-08,
"loss": 2.1249,
"step": 3
},
{
"epoch": 0.04155844155844156,
"grad_norm": 0.2641236484050751,
"learning_rate": 2.6666666666666667e-08,
"loss": 2.1248,
"step": 4
},
{
"epoch": 0.05194805194805195,
"grad_norm": 0.2523995637893677,
"learning_rate": 3.3333333333333334e-08,
"loss": 2.1085,
"step": 5
},
{
"epoch": 0.06233766233766234,
"grad_norm": 0.25376567244529724,
"learning_rate": 4e-08,
"loss": 2.0999,
"step": 6
},
{
"epoch": 0.07272727272727272,
"grad_norm": 0.2529865801334381,
"learning_rate": 4.666666666666667e-08,
"loss": 2.1005,
"step": 7
},
{
"epoch": 0.08311688311688312,
"grad_norm": 0.26591023802757263,
"learning_rate": 5.3333333333333334e-08,
"loss": 2.0975,
"step": 8
},
{
"epoch": 0.09350649350649351,
"grad_norm": 0.2609612047672272,
"learning_rate": 6e-08,
"loss": 2.1294,
"step": 9
},
{
"epoch": 0.1038961038961039,
"grad_norm": 0.260165274143219,
"learning_rate": 6.666666666666667e-08,
"loss": 2.0961,
"step": 10
},
{
"epoch": 0.11428571428571428,
"grad_norm": 0.2504313886165619,
"learning_rate": 7.333333333333333e-08,
"loss": 2.0863,
"step": 11
},
{
"epoch": 0.12467532467532468,
"grad_norm": 0.2633957266807556,
"learning_rate": 8e-08,
"loss": 2.0822,
"step": 12
},
{
"epoch": 0.13506493506493505,
"grad_norm": 0.2547496557235718,
"learning_rate": 8.666666666666666e-08,
"loss": 2.1066,
"step": 13
},
{
"epoch": 0.14545454545454545,
"grad_norm": 0.27546900510787964,
"learning_rate": 9.333333333333334e-08,
"loss": 2.1399,
"step": 14
},
{
"epoch": 0.15584415584415584,
"grad_norm": 0.25894761085510254,
"learning_rate": 1e-07,
"loss": 2.1071,
"step": 15
},
{
"epoch": 0.16623376623376623,
"grad_norm": 0.2625376284122467,
"learning_rate": 1.0666666666666667e-07,
"loss": 2.1171,
"step": 16
},
{
"epoch": 0.17662337662337663,
"grad_norm": 0.25433292984962463,
"learning_rate": 1.1333333333333332e-07,
"loss": 2.0932,
"step": 17
},
{
"epoch": 0.18701298701298702,
"grad_norm": 0.26560184359550476,
"learning_rate": 1.2e-07,
"loss": 2.0945,
"step": 18
},
{
"epoch": 0.1974025974025974,
"grad_norm": 0.26605260372161865,
"learning_rate": 1.2666666666666666e-07,
"loss": 2.1034,
"step": 19
},
{
"epoch": 0.2077922077922078,
"grad_norm": 0.2608611583709717,
"learning_rate": 1.3333333333333334e-07,
"loss": 2.0955,
"step": 20
},
{
"epoch": 0.21818181818181817,
"grad_norm": 0.2744869589805603,
"learning_rate": 1.4e-07,
"loss": 2.0934,
"step": 21
},
{
"epoch": 0.22857142857142856,
"grad_norm": 0.2602550685405731,
"learning_rate": 1.4666666666666666e-07,
"loss": 2.0788,
"step": 22
},
{
"epoch": 0.23896103896103896,
"grad_norm": 0.2581612467765808,
"learning_rate": 1.533333333333333e-07,
"loss": 2.1258,
"step": 23
},
{
"epoch": 0.24935064935064935,
"grad_norm": 0.25001809000968933,
"learning_rate": 1.6e-07,
"loss": 2.1034,
"step": 24
},
{
"epoch": 0.2597402597402597,
"grad_norm": 0.2558351457118988,
"learning_rate": 1.6666666666666665e-07,
"loss": 2.1111,
"step": 25
},
{
"epoch": 0.2701298701298701,
"grad_norm": 0.26691997051239014,
"learning_rate": 1.7333333333333332e-07,
"loss": 2.1056,
"step": 26
},
{
"epoch": 0.2805194805194805,
"grad_norm": 0.25776407122612,
"learning_rate": 1.8e-07,
"loss": 2.0942,
"step": 27
},
{
"epoch": 0.2909090909090909,
"grad_norm": 0.2654891610145569,
"learning_rate": 1.8666666666666667e-07,
"loss": 2.0946,
"step": 28
},
{
"epoch": 0.3012987012987013,
"grad_norm": 0.2603527009487152,
"learning_rate": 1.9333333333333332e-07,
"loss": 2.1056,
"step": 29
},
{
"epoch": 0.3116883116883117,
"grad_norm": 0.2545248568058014,
"learning_rate": 2e-07,
"loss": 2.1149,
"step": 30
},
{
"epoch": 0.3220779220779221,
"grad_norm": 0.26618441939353943,
"learning_rate": 2.0666666666666666e-07,
"loss": 2.1127,
"step": 31
},
{
"epoch": 0.33246753246753247,
"grad_norm": 0.26514533162117004,
"learning_rate": 2.1333333333333334e-07,
"loss": 2.096,
"step": 32
},
{
"epoch": 0.34285714285714286,
"grad_norm": 0.2551611065864563,
"learning_rate": 2.1999999999999998e-07,
"loss": 2.1051,
"step": 33
},
{
"epoch": 0.35324675324675325,
"grad_norm": 0.26668792963027954,
"learning_rate": 2.2666666666666663e-07,
"loss": 2.1081,
"step": 34
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.24424628913402557,
"learning_rate": 2.3333333333333333e-07,
"loss": 2.0856,
"step": 35
},
{
"epoch": 0.37402597402597404,
"grad_norm": 0.26595574617385864,
"learning_rate": 2.4e-07,
"loss": 2.1143,
"step": 36
},
{
"epoch": 0.38441558441558443,
"grad_norm": 0.26116102933883667,
"learning_rate": 2.4666666666666665e-07,
"loss": 2.0815,
"step": 37
},
{
"epoch": 0.3948051948051948,
"grad_norm": 0.2586928904056549,
"learning_rate": 2.533333333333333e-07,
"loss": 2.1031,
"step": 38
},
{
"epoch": 0.4051948051948052,
"grad_norm": 0.2762044668197632,
"learning_rate": 2.6e-07,
"loss": 2.116,
"step": 39
},
{
"epoch": 0.4155844155844156,
"grad_norm": 0.2627628445625305,
"learning_rate": 2.6666666666666667e-07,
"loss": 2.1134,
"step": 40
},
{
"epoch": 0.42597402597402595,
"grad_norm": 0.26995256543159485,
"learning_rate": 2.733333333333333e-07,
"loss": 2.0891,
"step": 41
},
{
"epoch": 0.43636363636363634,
"grad_norm": 0.25931429862976074,
"learning_rate": 2.8e-07,
"loss": 2.1116,
"step": 42
},
{
"epoch": 0.44675324675324674,
"grad_norm": 0.25965389609336853,
"learning_rate": 2.866666666666667e-07,
"loss": 2.0947,
"step": 43
},
{
"epoch": 0.45714285714285713,
"grad_norm": 0.2590876519680023,
"learning_rate": 2.933333333333333e-07,
"loss": 2.1066,
"step": 44
},
{
"epoch": 0.4675324675324675,
"grad_norm": 0.2535057067871094,
"learning_rate": 3e-07,
"loss": 2.0889,
"step": 45
},
{
"epoch": 0.4779220779220779,
"grad_norm": 0.26631295680999756,
"learning_rate": 3.066666666666666e-07,
"loss": 2.0938,
"step": 46
},
{
"epoch": 0.4883116883116883,
"grad_norm": 0.2609468102455139,
"learning_rate": 3.1333333333333333e-07,
"loss": 2.1034,
"step": 47
},
{
"epoch": 0.4987012987012987,
"grad_norm": 0.2554691731929779,
"learning_rate": 3.2e-07,
"loss": 2.1341,
"step": 48
},
{
"epoch": 0.509090909090909,
"grad_norm": 0.27121829986572266,
"learning_rate": 3.2666666666666663e-07,
"loss": 2.1238,
"step": 49
},
{
"epoch": 0.5194805194805194,
"grad_norm": 0.28885209560394287,
"learning_rate": 3.333333333333333e-07,
"loss": 2.0966,
"step": 50
},
{
"epoch": 0.5298701298701298,
"grad_norm": 0.28006577491760254,
"learning_rate": 3.4000000000000003e-07,
"loss": 2.1542,
"step": 51
},
{
"epoch": 0.5402597402597402,
"grad_norm": 0.26597273349761963,
"learning_rate": 3.4666666666666665e-07,
"loss": 2.1042,
"step": 52
},
{
"epoch": 0.5506493506493506,
"grad_norm": 0.2693743109703064,
"learning_rate": 3.533333333333333e-07,
"loss": 2.1125,
"step": 53
},
{
"epoch": 0.561038961038961,
"grad_norm": 0.25912410020828247,
"learning_rate": 3.6e-07,
"loss": 2.0925,
"step": 54
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.27581265568733215,
"learning_rate": 3.666666666666666e-07,
"loss": 2.0952,
"step": 55
},
{
"epoch": 0.5818181818181818,
"grad_norm": 0.271810382604599,
"learning_rate": 3.7333333333333334e-07,
"loss": 2.1108,
"step": 56
},
{
"epoch": 0.5922077922077922,
"grad_norm": 0.26822298765182495,
"learning_rate": 3.7999999999999996e-07,
"loss": 2.1143,
"step": 57
},
{
"epoch": 0.6025974025974026,
"grad_norm": 0.27131083607673645,
"learning_rate": 3.8666666666666664e-07,
"loss": 2.1061,
"step": 58
},
{
"epoch": 0.612987012987013,
"grad_norm": 0.2661900520324707,
"learning_rate": 3.933333333333333e-07,
"loss": 2.1068,
"step": 59
},
{
"epoch": 0.6233766233766234,
"grad_norm": 0.2700493335723877,
"learning_rate": 4e-07,
"loss": 2.1428,
"step": 60
},
{
"epoch": 0.6337662337662338,
"grad_norm": 0.2725152373313904,
"learning_rate": 4.0666666666666666e-07,
"loss": 2.1086,
"step": 61
},
{
"epoch": 0.6441558441558441,
"grad_norm": 0.27668678760528564,
"learning_rate": 4.1333333333333333e-07,
"loss": 2.1074,
"step": 62
},
{
"epoch": 0.6545454545454545,
"grad_norm": 0.2740931212902069,
"learning_rate": 4.1999999999999995e-07,
"loss": 2.0993,
"step": 63
},
{
"epoch": 0.6649350649350649,
"grad_norm": 0.26940861344337463,
"learning_rate": 4.266666666666667e-07,
"loss": 2.0949,
"step": 64
},
{
"epoch": 0.6753246753246753,
"grad_norm": 0.2628956139087677,
"learning_rate": 4.3333333333333335e-07,
"loss": 2.1142,
"step": 65
},
{
"epoch": 0.6857142857142857,
"grad_norm": 0.25989633798599243,
"learning_rate": 4.3999999999999997e-07,
"loss": 2.0982,
"step": 66
},
{
"epoch": 0.6961038961038961,
"grad_norm": 0.26778894662857056,
"learning_rate": 4.4666666666666664e-07,
"loss": 2.127,
"step": 67
},
{
"epoch": 0.7064935064935065,
"grad_norm": 0.27109193801879883,
"learning_rate": 4.5333333333333326e-07,
"loss": 2.1218,
"step": 68
},
{
"epoch": 0.7168831168831169,
"grad_norm": 0.2669210433959961,
"learning_rate": 4.6e-07,
"loss": 2.1008,
"step": 69
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.2724508047103882,
"learning_rate": 4.6666666666666666e-07,
"loss": 2.1101,
"step": 70
},
{
"epoch": 0.7376623376623377,
"grad_norm": 0.2761273980140686,
"learning_rate": 4.733333333333333e-07,
"loss": 2.0829,
"step": 71
},
{
"epoch": 0.7480519480519481,
"grad_norm": 0.27279791235923767,
"learning_rate": 4.8e-07,
"loss": 2.1186,
"step": 72
},
{
"epoch": 0.7584415584415585,
"grad_norm": 0.2563924193382263,
"learning_rate": 4.866666666666666e-07,
"loss": 2.1007,
"step": 73
},
{
"epoch": 0.7688311688311689,
"grad_norm": 0.271611750125885,
"learning_rate": 4.933333333333333e-07,
"loss": 2.0846,
"step": 74
},
{
"epoch": 0.7792207792207793,
"grad_norm": 0.2771332561969757,
"learning_rate": 5e-07,
"loss": 2.0993,
"step": 75
},
{
"epoch": 0.7896103896103897,
"grad_norm": 0.2688583433628082,
"learning_rate": 4.996491228070176e-07,
"loss": 2.0966,
"step": 76
},
{
"epoch": 0.8,
"grad_norm": 0.27472490072250366,
"learning_rate": 4.992982456140351e-07,
"loss": 2.0898,
"step": 77
},
{
"epoch": 0.8103896103896104,
"grad_norm": 0.2720624506473541,
"learning_rate": 4.989473684210527e-07,
"loss": 2.1063,
"step": 78
},
{
"epoch": 0.8207792207792208,
"grad_norm": 0.2789762020111084,
"learning_rate": 4.985964912280701e-07,
"loss": 2.1068,
"step": 79
},
{
"epoch": 0.8311688311688312,
"grad_norm": 0.2771006226539612,
"learning_rate": 4.982456140350877e-07,
"loss": 2.0926,
"step": 80
},
{
"epoch": 0.8415584415584415,
"grad_norm": 0.2923927307128906,
"learning_rate": 4.978947368421052e-07,
"loss": 2.109,
"step": 81
},
{
"epoch": 0.8519480519480519,
"grad_norm": 0.2825423777103424,
"learning_rate": 4.975438596491228e-07,
"loss": 2.1115,
"step": 82
},
{
"epoch": 0.8623376623376623,
"grad_norm": 0.28421661257743835,
"learning_rate": 4.971929824561403e-07,
"loss": 2.1057,
"step": 83
},
{
"epoch": 0.8727272727272727,
"grad_norm": 0.31991952657699585,
"learning_rate": 4.968421052631579e-07,
"loss": 2.0972,
"step": 84
},
{
"epoch": 0.8831168831168831,
"grad_norm": 0.28094005584716797,
"learning_rate": 4.964912280701754e-07,
"loss": 2.1209,
"step": 85
},
{
"epoch": 0.8935064935064935,
"grad_norm": 0.2852063775062561,
"learning_rate": 4.96140350877193e-07,
"loss": 2.1166,
"step": 86
},
{
"epoch": 0.9038961038961039,
"grad_norm": 0.27665138244628906,
"learning_rate": 4.957894736842105e-07,
"loss": 2.1284,
"step": 87
},
{
"epoch": 0.9142857142857143,
"grad_norm": 0.2683943808078766,
"learning_rate": 4.954385964912281e-07,
"loss": 2.1049,
"step": 88
},
{
"epoch": 0.9246753246753247,
"grad_norm": 0.27968302369117737,
"learning_rate": 4.950877192982457e-07,
"loss": 2.1182,
"step": 89
},
{
"epoch": 0.935064935064935,
"grad_norm": 0.27822941541671753,
"learning_rate": 4.947368421052631e-07,
"loss": 2.1164,
"step": 90
},
{
"epoch": 0.9454545454545454,
"grad_norm": 0.28795087337493896,
"learning_rate": 4.943859649122807e-07,
"loss": 2.1145,
"step": 91
},
{
"epoch": 0.9558441558441558,
"grad_norm": 0.2840701639652252,
"learning_rate": 4.940350877192982e-07,
"loss": 2.1172,
"step": 92
},
{
"epoch": 0.9662337662337662,
"grad_norm": 0.2897505462169647,
"learning_rate": 4.936842105263157e-07,
"loss": 2.0802,
"step": 93
},
{
"epoch": 0.9766233766233766,
"grad_norm": 0.2831282615661621,
"learning_rate": 4.933333333333333e-07,
"loss": 2.0798,
"step": 94
},
{
"epoch": 0.987012987012987,
"grad_norm": 0.2663356363773346,
"learning_rate": 4.929824561403508e-07,
"loss": 2.0551,
"step": 95
},
{
"epoch": 0.9974025974025974,
"grad_norm": 0.2793254852294922,
"learning_rate": 4.926315789473684e-07,
"loss": 2.1085,
"step": 96
},
{
"epoch": 0.9974025974025974,
"eval_loss": 2.1119654178619385,
"eval_runtime": 9.2468,
"eval_samples_per_second": 2.704,
"eval_steps_per_second": 0.433,
"step": 96
},
{
"epoch": 1.0077922077922077,
"grad_norm": 0.2765531837940216,
"learning_rate": 4.92280701754386e-07,
"loss": 2.1192,
"step": 97
},
{
"epoch": 1.018181818181818,
"grad_norm": 0.29020991921424866,
"learning_rate": 4.919298245614035e-07,
"loss": 2.1216,
"step": 98
},
{
"epoch": 1.0285714285714285,
"grad_norm": 0.2918996512889862,
"learning_rate": 4.915789473684211e-07,
"loss": 2.1035,
"step": 99
},
{
"epoch": 1.0389610389610389,
"grad_norm": 0.2785792350769043,
"learning_rate": 4.912280701754385e-07,
"loss": 2.1132,
"step": 100
},
{
"epoch": 1.0493506493506493,
"grad_norm": 0.27520841360092163,
"learning_rate": 4.908771929824561e-07,
"loss": 2.0914,
"step": 101
},
{
"epoch": 1.0597402597402596,
"grad_norm": 0.2906198799610138,
"learning_rate": 4.905263157894736e-07,
"loss": 2.1184,
"step": 102
},
{
"epoch": 1.07012987012987,
"grad_norm": 0.29240748286247253,
"learning_rate": 4.901754385964912e-07,
"loss": 2.1002,
"step": 103
},
{
"epoch": 1.0805194805194804,
"grad_norm": 0.2815570533275604,
"learning_rate": 4.898245614035087e-07,
"loss": 2.1022,
"step": 104
},
{
"epoch": 1.0909090909090908,
"grad_norm": 0.2839779853820801,
"learning_rate": 4.894736842105263e-07,
"loss": 2.0933,
"step": 105
},
{
"epoch": 1.1012987012987012,
"grad_norm": 0.28621962666511536,
"learning_rate": 4.891228070175438e-07,
"loss": 2.0967,
"step": 106
},
{
"epoch": 1.1116883116883116,
"grad_norm": 0.28691983222961426,
"learning_rate": 4.887719298245614e-07,
"loss": 2.09,
"step": 107
},
{
"epoch": 1.122077922077922,
"grad_norm": 0.28074637055397034,
"learning_rate": 4.884210526315789e-07,
"loss": 2.0933,
"step": 108
},
{
"epoch": 1.1324675324675324,
"grad_norm": 0.27770230174064636,
"learning_rate": 4.880701754385965e-07,
"loss": 2.0589,
"step": 109
},
{
"epoch": 1.1428571428571428,
"grad_norm": 0.30154216289520264,
"learning_rate": 4.877192982456141e-07,
"loss": 2.1361,
"step": 110
},
{
"epoch": 1.1532467532467532,
"grad_norm": 0.29188480973243713,
"learning_rate": 4.873684210526315e-07,
"loss": 2.1015,
"step": 111
},
{
"epoch": 1.1636363636363636,
"grad_norm": 0.2855170965194702,
"learning_rate": 4.870175438596491e-07,
"loss": 2.1281,
"step": 112
},
{
"epoch": 1.174025974025974,
"grad_norm": 0.2731803059577942,
"learning_rate": 4.866666666666666e-07,
"loss": 2.108,
"step": 113
},
{
"epoch": 1.1844155844155844,
"grad_norm": 0.29748019576072693,
"learning_rate": 4.863157894736842e-07,
"loss": 2.1409,
"step": 114
},
{
"epoch": 1.1948051948051948,
"grad_norm": 0.3487149178981781,
"learning_rate": 4.859649122807017e-07,
"loss": 2.1189,
"step": 115
},
{
"epoch": 1.2051948051948052,
"grad_norm": 0.2950851023197174,
"learning_rate": 4.856140350877193e-07,
"loss": 2.1297,
"step": 116
},
{
"epoch": 1.2155844155844155,
"grad_norm": 0.28961101174354553,
"learning_rate": 4.852631578947368e-07,
"loss": 2.1161,
"step": 117
},
{
"epoch": 1.225974025974026,
"grad_norm": 0.2877264618873596,
"learning_rate": 4.849122807017544e-07,
"loss": 2.0852,
"step": 118
},
{
"epoch": 1.2363636363636363,
"grad_norm": 0.29030388593673706,
"learning_rate": 4.845614035087719e-07,
"loss": 2.1115,
"step": 119
},
{
"epoch": 1.2467532467532467,
"grad_norm": 0.2916348874568939,
"learning_rate": 4.842105263157895e-07,
"loss": 2.0792,
"step": 120
},
{
"epoch": 1.2571428571428571,
"grad_norm": 0.2899838984012604,
"learning_rate": 4.838596491228071e-07,
"loss": 2.118,
"step": 121
},
{
"epoch": 1.2675324675324675,
"grad_norm": 0.28898170590400696,
"learning_rate": 4.835087719298245e-07,
"loss": 2.0958,
"step": 122
},
{
"epoch": 1.277922077922078,
"grad_norm": 0.29052966833114624,
"learning_rate": 4.831578947368421e-07,
"loss": 2.0883,
"step": 123
},
{
"epoch": 1.2883116883116883,
"grad_norm": 0.29824158549308777,
"learning_rate": 4.828070175438596e-07,
"loss": 2.1201,
"step": 124
},
{
"epoch": 1.2987012987012987,
"grad_norm": 0.2876146733760834,
"learning_rate": 4.824561403508772e-07,
"loss": 2.0847,
"step": 125
},
{
"epoch": 1.309090909090909,
"grad_norm": 0.29399487376213074,
"learning_rate": 4.821052631578947e-07,
"loss": 2.124,
"step": 126
},
{
"epoch": 1.3194805194805195,
"grad_norm": 0.28400611877441406,
"learning_rate": 4.817543859649122e-07,
"loss": 2.1024,
"step": 127
},
{
"epoch": 1.3298701298701299,
"grad_norm": 0.2978748083114624,
"learning_rate": 4.814035087719298e-07,
"loss": 2.1282,
"step": 128
},
{
"epoch": 1.3402597402597403,
"grad_norm": 0.30035582184791565,
"learning_rate": 4.810526315789473e-07,
"loss": 2.103,
"step": 129
},
{
"epoch": 1.3506493506493507,
"grad_norm": 0.29803967475891113,
"learning_rate": 4.807017543859649e-07,
"loss": 2.075,
"step": 130
},
{
"epoch": 1.361038961038961,
"grad_norm": 0.2902166247367859,
"learning_rate": 4.803508771929825e-07,
"loss": 2.1254,
"step": 131
},
{
"epoch": 1.3714285714285714,
"grad_norm": 0.3193589448928833,
"learning_rate": 4.8e-07,
"loss": 2.1075,
"step": 132
},
{
"epoch": 1.3818181818181818,
"grad_norm": 0.2991260290145874,
"learning_rate": 4.796491228070176e-07,
"loss": 2.101,
"step": 133
},
{
"epoch": 1.3922077922077922,
"grad_norm": 0.2965753376483917,
"learning_rate": 4.79298245614035e-07,
"loss": 2.1326,
"step": 134
},
{
"epoch": 1.4025974025974026,
"grad_norm": 0.29871872067451477,
"learning_rate": 4.789473684210526e-07,
"loss": 2.1365,
"step": 135
},
{
"epoch": 1.412987012987013,
"grad_norm": 0.3004847764968872,
"learning_rate": 4.785964912280701e-07,
"loss": 2.0904,
"step": 136
},
{
"epoch": 1.4233766233766234,
"grad_norm": 0.3125920295715332,
"learning_rate": 4.782456140350877e-07,
"loss": 2.1208,
"step": 137
},
{
"epoch": 1.4337662337662338,
"grad_norm": 0.2945519685745239,
"learning_rate": 4.778947368421052e-07,
"loss": 2.1064,
"step": 138
},
{
"epoch": 1.4441558441558442,
"grad_norm": 0.2995961308479309,
"learning_rate": 4.775438596491228e-07,
"loss": 2.1184,
"step": 139
},
{
"epoch": 1.4545454545454546,
"grad_norm": 0.30309098958969116,
"learning_rate": 4.771929824561403e-07,
"loss": 2.0869,
"step": 140
},
{
"epoch": 1.464935064935065,
"grad_norm": 0.3021141290664673,
"learning_rate": 4.768421052631579e-07,
"loss": 2.1027,
"step": 141
},
{
"epoch": 1.4753246753246754,
"grad_norm": 0.3091490864753723,
"learning_rate": 4.7649122807017547e-07,
"loss": 2.1051,
"step": 142
},
{
"epoch": 1.4857142857142858,
"grad_norm": 0.29339122772216797,
"learning_rate": 4.7614035087719296e-07,
"loss": 2.1045,
"step": 143
},
{
"epoch": 1.4961038961038962,
"grad_norm": 0.3079680800437927,
"learning_rate": 4.757894736842105e-07,
"loss": 2.1066,
"step": 144
},
{
"epoch": 1.5064935064935066,
"grad_norm": 0.2988393008708954,
"learning_rate": 4.7543859649122804e-07,
"loss": 2.1134,
"step": 145
},
{
"epoch": 1.516883116883117,
"grad_norm": 0.3002830147743225,
"learning_rate": 4.750877192982456e-07,
"loss": 2.1039,
"step": 146
},
{
"epoch": 1.5272727272727273,
"grad_norm": 0.3064285218715668,
"learning_rate": 4.747368421052632e-07,
"loss": 2.1319,
"step": 147
},
{
"epoch": 1.5376623376623377,
"grad_norm": 0.2974233031272888,
"learning_rate": 4.7438596491228066e-07,
"loss": 2.1008,
"step": 148
},
{
"epoch": 1.5480519480519481,
"grad_norm": 0.2973909080028534,
"learning_rate": 4.7403508771929826e-07,
"loss": 2.1089,
"step": 149
},
{
"epoch": 1.5584415584415585,
"grad_norm": 0.29095181822776794,
"learning_rate": 4.7368421052631574e-07,
"loss": 2.0994,
"step": 150
},
{
"epoch": 1.568831168831169,
"grad_norm": 0.30576032400131226,
"learning_rate": 4.733333333333333e-07,
"loss": 2.1019,
"step": 151
},
{
"epoch": 1.5792207792207793,
"grad_norm": 0.31595948338508606,
"learning_rate": 4.729824561403509e-07,
"loss": 2.1072,
"step": 152
},
{
"epoch": 1.5896103896103897,
"grad_norm": 0.3032829165458679,
"learning_rate": 4.7263157894736837e-07,
"loss": 2.1066,
"step": 153
},
{
"epoch": 1.6,
"grad_norm": 0.302549809217453,
"learning_rate": 4.7228070175438596e-07,
"loss": 2.0903,
"step": 154
},
{
"epoch": 1.6103896103896105,
"grad_norm": 0.30395421385765076,
"learning_rate": 4.719298245614035e-07,
"loss": 2.1183,
"step": 155
},
{
"epoch": 1.6207792207792209,
"grad_norm": 0.30267906188964844,
"learning_rate": 4.7157894736842104e-07,
"loss": 2.0934,
"step": 156
},
{
"epoch": 1.6311688311688313,
"grad_norm": 0.30717286467552185,
"learning_rate": 4.712280701754386e-07,
"loss": 2.0956,
"step": 157
},
{
"epoch": 1.6415584415584417,
"grad_norm": 0.30463624000549316,
"learning_rate": 4.7087719298245607e-07,
"loss": 2.091,
"step": 158
},
{
"epoch": 1.651948051948052,
"grad_norm": 0.3144147992134094,
"learning_rate": 4.7052631578947366e-07,
"loss": 2.1109,
"step": 159
},
{
"epoch": 1.6623376623376624,
"grad_norm": 0.30240705609321594,
"learning_rate": 4.701754385964912e-07,
"loss": 2.0907,
"step": 160
},
{
"epoch": 1.6727272727272728,
"grad_norm": 0.2900020182132721,
"learning_rate": 4.6982456140350874e-07,
"loss": 2.0762,
"step": 161
},
{
"epoch": 1.6831168831168832,
"grad_norm": 0.2979305684566498,
"learning_rate": 4.694736842105263e-07,
"loss": 2.0849,
"step": 162
},
{
"epoch": 1.6935064935064936,
"grad_norm": 0.3039534389972687,
"learning_rate": 4.691228070175439e-07,
"loss": 2.0975,
"step": 163
},
{
"epoch": 1.703896103896104,
"grad_norm": 0.30724868178367615,
"learning_rate": 4.6877192982456137e-07,
"loss": 2.1221,
"step": 164
},
{
"epoch": 1.7142857142857144,
"grad_norm": 0.3075260818004608,
"learning_rate": 4.6842105263157896e-07,
"loss": 2.0954,
"step": 165
},
{
"epoch": 1.7246753246753248,
"grad_norm": 0.3092438876628876,
"learning_rate": 4.6807017543859645e-07,
"loss": 2.1003,
"step": 166
},
{
"epoch": 1.7350649350649352,
"grad_norm": 0.3084464371204376,
"learning_rate": 4.67719298245614e-07,
"loss": 2.1137,
"step": 167
},
{
"epoch": 1.7454545454545456,
"grad_norm": 0.3065555989742279,
"learning_rate": 4.673684210526316e-07,
"loss": 2.1015,
"step": 168
},
{
"epoch": 1.755844155844156,
"grad_norm": 0.304202139377594,
"learning_rate": 4.6701754385964907e-07,
"loss": 2.097,
"step": 169
},
{
"epoch": 1.7662337662337664,
"grad_norm": 0.31230100989341736,
"learning_rate": 4.6666666666666666e-07,
"loss": 2.1061,
"step": 170
},
{
"epoch": 1.7766233766233768,
"grad_norm": 0.31154370307922363,
"learning_rate": 4.6631578947368415e-07,
"loss": 2.1021,
"step": 171
},
{
"epoch": 1.7870129870129872,
"grad_norm": 0.30575987696647644,
"learning_rate": 4.6596491228070174e-07,
"loss": 2.0911,
"step": 172
},
{
"epoch": 1.7974025974025976,
"grad_norm": 0.3098468780517578,
"learning_rate": 4.656140350877193e-07,
"loss": 2.0824,
"step": 173
},
{
"epoch": 1.807792207792208,
"grad_norm": 0.32056814432144165,
"learning_rate": 4.652631578947368e-07,
"loss": 2.0887,
"step": 174
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.3210814595222473,
"learning_rate": 4.6491228070175437e-07,
"loss": 2.1179,
"step": 175
},
{
"epoch": 1.8285714285714287,
"grad_norm": 0.31061846017837524,
"learning_rate": 4.645614035087719e-07,
"loss": 2.0973,
"step": 176
},
{
"epoch": 1.838961038961039,
"grad_norm": 0.3153719902038574,
"learning_rate": 4.6421052631578945e-07,
"loss": 2.1132,
"step": 177
},
{
"epoch": 1.8493506493506493,
"grad_norm": 0.3217187821865082,
"learning_rate": 4.63859649122807e-07,
"loss": 2.1069,
"step": 178
},
{
"epoch": 1.8597402597402597,
"grad_norm": 0.31842827796936035,
"learning_rate": 4.6350877192982453e-07,
"loss": 2.1114,
"step": 179
},
{
"epoch": 1.87012987012987,
"grad_norm": 0.31468620896339417,
"learning_rate": 4.6315789473684207e-07,
"loss": 2.0966,
"step": 180
},
{
"epoch": 1.8805194805194805,
"grad_norm": 0.3157198131084442,
"learning_rate": 4.6280701754385966e-07,
"loss": 2.1267,
"step": 181
},
{
"epoch": 1.8909090909090909,
"grad_norm": 0.32008498907089233,
"learning_rate": 4.6245614035087715e-07,
"loss": 2.0975,
"step": 182
},
{
"epoch": 1.9012987012987013,
"grad_norm": 0.314317911863327,
"learning_rate": 4.6210526315789475e-07,
"loss": 2.0871,
"step": 183
},
{
"epoch": 1.9116883116883117,
"grad_norm": 0.3036201298236847,
"learning_rate": 4.617543859649123e-07,
"loss": 2.0865,
"step": 184
},
{
"epoch": 1.922077922077922,
"grad_norm": 0.3069418668746948,
"learning_rate": 4.614035087719298e-07,
"loss": 2.084,
"step": 185
},
{
"epoch": 1.9324675324675324,
"grad_norm": 0.31745216250419617,
"learning_rate": 4.6105263157894737e-07,
"loss": 2.0997,
"step": 186
},
{
"epoch": 1.9428571428571428,
"grad_norm": 0.31132858991622925,
"learning_rate": 4.6070175438596486e-07,
"loss": 2.0687,
"step": 187
},
{
"epoch": 1.9532467532467532,
"grad_norm": 0.3204294443130493,
"learning_rate": 4.6035087719298245e-07,
"loss": 2.1131,
"step": 188
},
{
"epoch": 1.9636363636363636,
"grad_norm": 0.31889617443084717,
"learning_rate": 4.6e-07,
"loss": 2.1159,
"step": 189
},
{
"epoch": 1.974025974025974,
"grad_norm": 0.3156440854072571,
"learning_rate": 4.5964912280701753e-07,
"loss": 2.091,
"step": 190
},
{
"epoch": 1.9844155844155844,
"grad_norm": 0.3086424171924591,
"learning_rate": 4.5929824561403507e-07,
"loss": 2.0972,
"step": 191
},
{
"epoch": 1.9948051948051948,
"grad_norm": 0.31145980954170227,
"learning_rate": 4.5894736842105256e-07,
"loss": 2.0773,
"step": 192
},
{
"epoch": 1.9948051948051948,
"eval_loss": 2.108880043029785,
"eval_runtime": 9.2552,
"eval_samples_per_second": 2.701,
"eval_steps_per_second": 0.432,
"step": 192
},
{
"epoch": 2.005194805194805,
"grad_norm": 0.3105067014694214,
"learning_rate": 4.5859649122807015e-07,
"loss": 2.0807,
"step": 193
},
{
"epoch": 2.0155844155844154,
"grad_norm": 0.3223939538002014,
"learning_rate": 4.582456140350877e-07,
"loss": 2.1253,
"step": 194
},
{
"epoch": 2.0259740259740258,
"grad_norm": 0.3066651523113251,
"learning_rate": 4.5789473684210523e-07,
"loss": 2.079,
"step": 195
},
{
"epoch": 2.036363636363636,
"grad_norm": 0.31992414593696594,
"learning_rate": 4.575438596491228e-07,
"loss": 2.1183,
"step": 196
},
{
"epoch": 2.0467532467532465,
"grad_norm": 0.3262827396392822,
"learning_rate": 4.5719298245614037e-07,
"loss": 2.1356,
"step": 197
},
{
"epoch": 2.057142857142857,
"grad_norm": 0.31810563802719116,
"learning_rate": 4.5684210526315786e-07,
"loss": 2.0954,
"step": 198
},
{
"epoch": 2.0675324675324673,
"grad_norm": 0.3235313892364502,
"learning_rate": 4.5649122807017545e-07,
"loss": 2.1088,
"step": 199
},
{
"epoch": 2.0779220779220777,
"grad_norm": 0.3249002695083618,
"learning_rate": 4.5614035087719294e-07,
"loss": 2.1149,
"step": 200
},
{
"epoch": 2.088311688311688,
"grad_norm": 0.30616623163223267,
"learning_rate": 4.557894736842105e-07,
"loss": 2.0569,
"step": 201
},
{
"epoch": 2.0987012987012985,
"grad_norm": 0.320377916097641,
"learning_rate": 4.5543859649122807e-07,
"loss": 2.1122,
"step": 202
},
{
"epoch": 2.109090909090909,
"grad_norm": 0.3134180009365082,
"learning_rate": 4.5508771929824556e-07,
"loss": 2.0919,
"step": 203
},
{
"epoch": 2.1194805194805193,
"grad_norm": 0.32783976197242737,
"learning_rate": 4.5473684210526315e-07,
"loss": 2.0984,
"step": 204
},
{
"epoch": 2.1298701298701297,
"grad_norm": 0.32066580653190613,
"learning_rate": 4.543859649122807e-07,
"loss": 2.0876,
"step": 205
},
{
"epoch": 2.14025974025974,
"grad_norm": 0.32365262508392334,
"learning_rate": 4.5403508771929823e-07,
"loss": 2.1082,
"step": 206
},
{
"epoch": 2.1506493506493505,
"grad_norm": 0.3484063148498535,
"learning_rate": 4.536842105263158e-07,
"loss": 2.1351,
"step": 207
},
{
"epoch": 2.161038961038961,
"grad_norm": 0.3321819603443146,
"learning_rate": 4.5333333333333326e-07,
"loss": 2.0974,
"step": 208
},
{
"epoch": 2.1714285714285713,
"grad_norm": 0.32974711060523987,
"learning_rate": 4.5298245614035086e-07,
"loss": 2.1051,
"step": 209
},
{
"epoch": 2.1818181818181817,
"grad_norm": 0.319308876991272,
"learning_rate": 4.526315789473684e-07,
"loss": 2.0817,
"step": 210
},
{
"epoch": 2.192207792207792,
"grad_norm": 0.3337661325931549,
"learning_rate": 4.5228070175438594e-07,
"loss": 2.1134,
"step": 211
},
{
"epoch": 2.2025974025974024,
"grad_norm": 0.3277778625488281,
"learning_rate": 4.519298245614035e-07,
"loss": 2.09,
"step": 212
},
{
"epoch": 2.212987012987013,
"grad_norm": 0.31506606936454773,
"learning_rate": 4.5157894736842107e-07,
"loss": 2.1009,
"step": 213
},
{
"epoch": 2.2233766233766232,
"grad_norm": 0.34575071930885315,
"learning_rate": 4.5122807017543856e-07,
"loss": 2.1461,
"step": 214
},
{
"epoch": 2.2337662337662336,
"grad_norm": 0.3281601071357727,
"learning_rate": 4.5087719298245615e-07,
"loss": 2.0806,
"step": 215
},
{
"epoch": 2.244155844155844,
"grad_norm": 0.32512006163597107,
"learning_rate": 4.5052631578947364e-07,
"loss": 2.1075,
"step": 216
},
{
"epoch": 2.2545454545454544,
"grad_norm": 0.3208228647708893,
"learning_rate": 4.501754385964912e-07,
"loss": 2.0935,
"step": 217
},
{
"epoch": 2.264935064935065,
"grad_norm": 0.33443257212638855,
"learning_rate": 4.498245614035088e-07,
"loss": 2.1021,
"step": 218
},
{
"epoch": 2.275324675324675,
"grad_norm": 0.3280114531517029,
"learning_rate": 4.4947368421052626e-07,
"loss": 2.1027,
"step": 219
},
{
"epoch": 2.2857142857142856,
"grad_norm": 0.3224703073501587,
"learning_rate": 4.4912280701754386e-07,
"loss": 2.1239,
"step": 220
},
{
"epoch": 2.296103896103896,
"grad_norm": 0.34066352248191833,
"learning_rate": 4.4877192982456135e-07,
"loss": 2.0794,
"step": 221
},
{
"epoch": 2.3064935064935064,
"grad_norm": 0.3266121745109558,
"learning_rate": 4.4842105263157894e-07,
"loss": 2.115,
"step": 222
},
{
"epoch": 2.3168831168831168,
"grad_norm": 0.3360849618911743,
"learning_rate": 4.480701754385965e-07,
"loss": 2.1042,
"step": 223
},
{
"epoch": 2.327272727272727,
"grad_norm": 0.33068639039993286,
"learning_rate": 4.47719298245614e-07,
"loss": 2.1146,
"step": 224
},
{
"epoch": 2.3376623376623376,
"grad_norm": 0.32161960005760193,
"learning_rate": 4.4736842105263156e-07,
"loss": 2.0967,
"step": 225
},
{
"epoch": 2.348051948051948,
"grad_norm": 0.3327421545982361,
"learning_rate": 4.470175438596491e-07,
"loss": 2.097,
"step": 226
},
{
"epoch": 2.3584415584415583,
"grad_norm": 0.326388955116272,
"learning_rate": 4.4666666666666664e-07,
"loss": 2.0919,
"step": 227
},
{
"epoch": 2.3688311688311687,
"grad_norm": 0.3344230353832245,
"learning_rate": 4.463157894736842e-07,
"loss": 2.1358,
"step": 228
},
{
"epoch": 2.379220779220779,
"grad_norm": 0.33497416973114014,
"learning_rate": 4.459649122807017e-07,
"loss": 2.0969,
"step": 229
},
{
"epoch": 2.3896103896103895,
"grad_norm": 0.33529427647590637,
"learning_rate": 4.4561403508771927e-07,
"loss": 2.0841,
"step": 230
},
{
"epoch": 2.4,
"grad_norm": 0.3811129629611969,
"learning_rate": 4.4526315789473686e-07,
"loss": 2.097,
"step": 231
},
{
"epoch": 2.4103896103896103,
"grad_norm": 0.33154216408729553,
"learning_rate": 4.4491228070175435e-07,
"loss": 2.1155,
"step": 232
},
{
"epoch": 2.4207792207792207,
"grad_norm": 0.32626327872276306,
"learning_rate": 4.4456140350877194e-07,
"loss": 2.0988,
"step": 233
},
{
"epoch": 2.431168831168831,
"grad_norm": 0.33323419094085693,
"learning_rate": 4.442105263157895e-07,
"loss": 2.0891,
"step": 234
},
{
"epoch": 2.4415584415584415,
"grad_norm": 0.34010282158851624,
"learning_rate": 4.4385964912280697e-07,
"loss": 2.1,
"step": 235
},
{
"epoch": 2.451948051948052,
"grad_norm": 0.3339459002017975,
"learning_rate": 4.4350877192982456e-07,
"loss": 2.09,
"step": 236
},
{
"epoch": 2.4623376623376623,
"grad_norm": 0.33162543177604675,
"learning_rate": 4.4315789473684205e-07,
"loss": 2.0698,
"step": 237
},
{
"epoch": 2.4727272727272727,
"grad_norm": 0.3330386281013489,
"learning_rate": 4.4280701754385964e-07,
"loss": 2.0948,
"step": 238
},
{
"epoch": 2.483116883116883,
"grad_norm": 0.33064815402030945,
"learning_rate": 4.424561403508772e-07,
"loss": 2.0764,
"step": 239
},
{
"epoch": 2.4935064935064934,
"grad_norm": 0.3411155641078949,
"learning_rate": 4.421052631578947e-07,
"loss": 2.1208,
"step": 240
},
{
"epoch": 2.503896103896104,
"grad_norm": 0.3448623716831207,
"learning_rate": 4.4175438596491227e-07,
"loss": 2.1029,
"step": 241
},
{
"epoch": 2.5142857142857142,
"grad_norm": 0.3325323164463043,
"learning_rate": 4.4140350877192975e-07,
"loss": 2.0744,
"step": 242
},
{
"epoch": 2.5246753246753246,
"grad_norm": 0.3473738133907318,
"learning_rate": 4.4105263157894735e-07,
"loss": 2.0711,
"step": 243
},
{
"epoch": 2.535064935064935,
"grad_norm": 0.33442118763923645,
"learning_rate": 4.407017543859649e-07,
"loss": 2.0977,
"step": 244
},
{
"epoch": 2.5454545454545454,
"grad_norm": 0.32157817482948303,
"learning_rate": 4.4035087719298243e-07,
"loss": 2.0851,
"step": 245
},
{
"epoch": 2.555844155844156,
"grad_norm": 0.3335091471672058,
"learning_rate": 4.3999999999999997e-07,
"loss": 2.1119,
"step": 246
},
{
"epoch": 2.566233766233766,
"grad_norm": 0.32816439867019653,
"learning_rate": 4.3964912280701756e-07,
"loss": 2.0749,
"step": 247
},
{
"epoch": 2.5766233766233766,
"grad_norm": 0.3414842486381531,
"learning_rate": 4.3929824561403505e-07,
"loss": 2.1005,
"step": 248
},
{
"epoch": 2.587012987012987,
"grad_norm": 0.33331358432769775,
"learning_rate": 4.3894736842105264e-07,
"loss": 2.0987,
"step": 249
},
{
"epoch": 2.5974025974025974,
"grad_norm": 0.3305921256542206,
"learning_rate": 4.3859649122807013e-07,
"loss": 2.1071,
"step": 250
},
{
"epoch": 2.6077922077922078,
"grad_norm": 0.3360355794429779,
"learning_rate": 4.3824561403508767e-07,
"loss": 2.0854,
"step": 251
},
{
"epoch": 2.618181818181818,
"grad_norm": 0.34479108452796936,
"learning_rate": 4.3789473684210527e-07,
"loss": 2.0948,
"step": 252
},
{
"epoch": 2.6285714285714286,
"grad_norm": 0.3384932577610016,
"learning_rate": 4.3754385964912275e-07,
"loss": 2.0911,
"step": 253
},
{
"epoch": 2.638961038961039,
"grad_norm": 0.3388522267341614,
"learning_rate": 4.3719298245614035e-07,
"loss": 2.1016,
"step": 254
},
{
"epoch": 2.6493506493506493,
"grad_norm": 0.3315964341163635,
"learning_rate": 4.368421052631579e-07,
"loss": 2.0935,
"step": 255
},
{
"epoch": 2.6597402597402597,
"grad_norm": 0.34984835982322693,
"learning_rate": 4.3649122807017543e-07,
"loss": 2.1237,
"step": 256
},
{
"epoch": 2.67012987012987,
"grad_norm": 0.35086217522621155,
"learning_rate": 4.3614035087719297e-07,
"loss": 2.1158,
"step": 257
},
{
"epoch": 2.6805194805194805,
"grad_norm": 0.3384174108505249,
"learning_rate": 4.357894736842105e-07,
"loss": 2.0945,
"step": 258
},
{
"epoch": 2.690909090909091,
"grad_norm": 0.33980512619018555,
"learning_rate": 4.3543859649122805e-07,
"loss": 2.1045,
"step": 259
},
{
"epoch": 2.7012987012987013,
"grad_norm": 0.3519754111766815,
"learning_rate": 4.350877192982456e-07,
"loss": 2.1155,
"step": 260
},
{
"epoch": 2.7116883116883117,
"grad_norm": 0.3461025655269623,
"learning_rate": 4.3473684210526313e-07,
"loss": 2.0973,
"step": 261
},
{
"epoch": 2.722077922077922,
"grad_norm": 0.35459649562835693,
"learning_rate": 4.343859649122807e-07,
"loss": 2.0845,
"step": 262
},
{
"epoch": 2.7324675324675325,
"grad_norm": 0.3461463451385498,
"learning_rate": 4.340350877192982e-07,
"loss": 2.087,
"step": 263
},
{
"epoch": 2.742857142857143,
"grad_norm": 0.3309321999549866,
"learning_rate": 4.3368421052631576e-07,
"loss": 2.1044,
"step": 264
},
{
"epoch": 2.7532467532467533,
"grad_norm": 0.349258154630661,
"learning_rate": 4.3333333333333335e-07,
"loss": 2.101,
"step": 265
},
{
"epoch": 2.7636363636363637,
"grad_norm": 0.3515391945838928,
"learning_rate": 4.3298245614035084e-07,
"loss": 2.1061,
"step": 266
},
{
"epoch": 2.774025974025974,
"grad_norm": 0.34546464681625366,
"learning_rate": 4.326315789473684e-07,
"loss": 2.112,
"step": 267
},
{
"epoch": 2.7844155844155845,
"grad_norm": 0.34706681966781616,
"learning_rate": 4.3228070175438597e-07,
"loss": 2.1201,
"step": 268
},
{
"epoch": 2.794805194805195,
"grad_norm": 0.34747087955474854,
"learning_rate": 4.3192982456140346e-07,
"loss": 2.0849,
"step": 269
},
{
"epoch": 2.8051948051948052,
"grad_norm": 0.34778711199760437,
"learning_rate": 4.3157894736842105e-07,
"loss": 2.092,
"step": 270
},
{
"epoch": 2.8155844155844156,
"grad_norm": 0.34066757559776306,
"learning_rate": 4.3122807017543854e-07,
"loss": 2.0752,
"step": 271
},
{
"epoch": 2.825974025974026,
"grad_norm": 0.342899352312088,
"learning_rate": 4.3087719298245613e-07,
"loss": 2.1058,
"step": 272
},
{
"epoch": 2.8363636363636364,
"grad_norm": 0.34262892603874207,
"learning_rate": 4.305263157894737e-07,
"loss": 2.0855,
"step": 273
},
{
"epoch": 2.846753246753247,
"grad_norm": 0.34279361367225647,
"learning_rate": 4.301754385964912e-07,
"loss": 2.0939,
"step": 274
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.34948858618736267,
"learning_rate": 4.2982456140350876e-07,
"loss": 2.1111,
"step": 275
},
{
"epoch": 2.8675324675324676,
"grad_norm": 0.34498292207717896,
"learning_rate": 4.294736842105263e-07,
"loss": 2.0952,
"step": 276
},
{
"epoch": 2.877922077922078,
"grad_norm": 0.34956324100494385,
"learning_rate": 4.2912280701754384e-07,
"loss": 2.1095,
"step": 277
},
{
"epoch": 2.8883116883116884,
"grad_norm": 0.341450035572052,
"learning_rate": 4.287719298245614e-07,
"loss": 2.0893,
"step": 278
},
{
"epoch": 2.898701298701299,
"grad_norm": 0.34633004665374756,
"learning_rate": 4.284210526315789e-07,
"loss": 2.1014,
"step": 279
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.35773593187332153,
"learning_rate": 4.2807017543859646e-07,
"loss": 2.1367,
"step": 280
},
{
"epoch": 2.9194805194805196,
"grad_norm": 0.34510213136672974,
"learning_rate": 4.2771929824561405e-07,
"loss": 2.0917,
"step": 281
},
{
"epoch": 2.92987012987013,
"grad_norm": 0.34937089681625366,
"learning_rate": 4.2736842105263154e-07,
"loss": 2.078,
"step": 282
},
{
"epoch": 2.9402597402597404,
"grad_norm": 0.35117366909980774,
"learning_rate": 4.2701754385964913e-07,
"loss": 2.0762,
"step": 283
},
{
"epoch": 2.9506493506493507,
"grad_norm": 0.3488243520259857,
"learning_rate": 4.266666666666667e-07,
"loss": 2.1195,
"step": 284
},
{
"epoch": 2.961038961038961,
"grad_norm": 0.3523004949092865,
"learning_rate": 4.2631578947368416e-07,
"loss": 2.093,
"step": 285
},
{
"epoch": 2.9714285714285715,
"grad_norm": 0.35649630427360535,
"learning_rate": 4.2596491228070176e-07,
"loss": 2.0939,
"step": 286
},
{
"epoch": 2.981818181818182,
"grad_norm": 0.3453286588191986,
"learning_rate": 4.2561403508771924e-07,
"loss": 2.077,
"step": 287
},
{
"epoch": 2.9922077922077923,
"grad_norm": 0.3536374270915985,
"learning_rate": 4.2526315789473684e-07,
"loss": 2.1049,
"step": 288
},
{
"epoch": 2.9922077922077923,
"eval_loss": 2.103614568710327,
"eval_runtime": 9.2191,
"eval_samples_per_second": 2.712,
"eval_steps_per_second": 0.434,
"step": 288
},
{
"epoch": 3.0025974025974027,
"grad_norm": 0.34745439887046814,
"learning_rate": 4.249122807017544e-07,
"loss": 2.1012,
"step": 289
},
{
"epoch": 3.012987012987013,
"grad_norm": 0.3465849459171295,
"learning_rate": 4.245614035087719e-07,
"loss": 2.0759,
"step": 290
},
{
"epoch": 3.0233766233766235,
"grad_norm": 0.34689581394195557,
"learning_rate": 4.2421052631578946e-07,
"loss": 2.0887,
"step": 291
},
{
"epoch": 3.033766233766234,
"grad_norm": 0.3415219485759735,
"learning_rate": 4.2385964912280695e-07,
"loss": 2.0486,
"step": 292
},
{
"epoch": 3.0441558441558443,
"grad_norm": 0.34754109382629395,
"learning_rate": 4.2350877192982454e-07,
"loss": 2.0737,
"step": 293
},
{
"epoch": 3.0545454545454547,
"grad_norm": 0.3473778963088989,
"learning_rate": 4.231578947368421e-07,
"loss": 2.0727,
"step": 294
},
{
"epoch": 3.064935064935065,
"grad_norm": 0.3525889217853546,
"learning_rate": 4.228070175438596e-07,
"loss": 2.0836,
"step": 295
},
{
"epoch": 3.0753246753246755,
"grad_norm": 0.34054872393608093,
"learning_rate": 4.2245614035087716e-07,
"loss": 2.0951,
"step": 296
},
{
"epoch": 3.085714285714286,
"grad_norm": 0.35042187571525574,
"learning_rate": 4.2210526315789476e-07,
"loss": 2.0902,
"step": 297
},
{
"epoch": 3.0961038961038962,
"grad_norm": 0.33843863010406494,
"learning_rate": 4.2175438596491225e-07,
"loss": 2.1046,
"step": 298
},
{
"epoch": 3.1064935064935066,
"grad_norm": 0.36478835344314575,
"learning_rate": 4.2140350877192984e-07,
"loss": 2.0959,
"step": 299
},
{
"epoch": 3.116883116883117,
"grad_norm": 0.36838847398757935,
"learning_rate": 4.2105263157894733e-07,
"loss": 2.1017,
"step": 300
},
{
"epoch": 3.1272727272727274,
"grad_norm": 0.3492492735385895,
"learning_rate": 4.2070175438596487e-07,
"loss": 2.0701,
"step": 301
},
{
"epoch": 3.137662337662338,
"grad_norm": 0.3571326732635498,
"learning_rate": 4.2035087719298246e-07,
"loss": 2.1061,
"step": 302
},
{
"epoch": 3.148051948051948,
"grad_norm": 0.35111817717552185,
"learning_rate": 4.1999999999999995e-07,
"loss": 2.1311,
"step": 303
},
{
"epoch": 3.1584415584415586,
"grad_norm": 0.3499947786331177,
"learning_rate": 4.1964912280701754e-07,
"loss": 2.0856,
"step": 304
},
{
"epoch": 3.168831168831169,
"grad_norm": 0.3530219495296478,
"learning_rate": 4.192982456140351e-07,
"loss": 2.0947,
"step": 305
},
{
"epoch": 3.1792207792207794,
"grad_norm": 0.3510328233242035,
"learning_rate": 4.189473684210526e-07,
"loss": 2.1088,
"step": 306
},
{
"epoch": 3.18961038961039,
"grad_norm": 0.36004605889320374,
"learning_rate": 4.1859649122807017e-07,
"loss": 2.1258,
"step": 307
},
{
"epoch": 3.2,
"grad_norm": 0.35782137513160706,
"learning_rate": 4.182456140350877e-07,
"loss": 2.1167,
"step": 308
},
{
"epoch": 3.2103896103896106,
"grad_norm": 0.35306283831596375,
"learning_rate": 4.1789473684210525e-07,
"loss": 2.0918,
"step": 309
},
{
"epoch": 3.220779220779221,
"grad_norm": 0.35812172293663025,
"learning_rate": 4.175438596491228e-07,
"loss": 2.0857,
"step": 310
},
{
"epoch": 3.2311688311688314,
"grad_norm": 0.35852885246276855,
"learning_rate": 4.1719298245614033e-07,
"loss": 2.0866,
"step": 311
},
{
"epoch": 3.2415584415584417,
"grad_norm": 0.3610192537307739,
"learning_rate": 4.1684210526315787e-07,
"loss": 2.0741,
"step": 312
},
{
"epoch": 3.2519480519480517,
"grad_norm": 0.36115092039108276,
"learning_rate": 4.164912280701754e-07,
"loss": 2.0727,
"step": 313
},
{
"epoch": 3.2623376623376625,
"grad_norm": 0.35251882672309875,
"learning_rate": 4.1614035087719295e-07,
"loss": 2.0991,
"step": 314
},
{
"epoch": 3.2727272727272725,
"grad_norm": 0.35901498794555664,
"learning_rate": 4.1578947368421054e-07,
"loss": 2.1008,
"step": 315
},
{
"epoch": 3.2831168831168833,
"grad_norm": 0.3722142279148102,
"learning_rate": 4.1543859649122803e-07,
"loss": 2.1037,
"step": 316
},
{
"epoch": 3.2935064935064933,
"grad_norm": 0.3581855893135071,
"learning_rate": 4.1508771929824557e-07,
"loss": 2.0989,
"step": 317
},
{
"epoch": 3.303896103896104,
"grad_norm": 0.3623902499675751,
"learning_rate": 4.1473684210526317e-07,
"loss": 2.0989,
"step": 318
},
{
"epoch": 3.314285714285714,
"grad_norm": 0.3492812514305115,
"learning_rate": 4.1438596491228065e-07,
"loss": 2.1105,
"step": 319
},
{
"epoch": 3.324675324675325,
"grad_norm": 0.36448824405670166,
"learning_rate": 4.1403508771929825e-07,
"loss": 2.097,
"step": 320
},
{
"epoch": 3.335064935064935,
"grad_norm": 0.36863312125205994,
"learning_rate": 4.1368421052631574e-07,
"loss": 2.1136,
"step": 321
},
{
"epoch": 3.3454545454545457,
"grad_norm": 0.3554861545562744,
"learning_rate": 4.1333333333333333e-07,
"loss": 2.0715,
"step": 322
},
{
"epoch": 3.3558441558441556,
"grad_norm": 0.36161860823631287,
"learning_rate": 4.1298245614035087e-07,
"loss": 2.0668,
"step": 323
},
{
"epoch": 3.3662337662337665,
"grad_norm": 0.3509725332260132,
"learning_rate": 4.126315789473684e-07,
"loss": 2.0771,
"step": 324
},
{
"epoch": 3.3766233766233764,
"grad_norm": 0.3517126739025116,
"learning_rate": 4.1228070175438595e-07,
"loss": 2.0805,
"step": 325
},
{
"epoch": 3.3870129870129873,
"grad_norm": 0.35390713810920715,
"learning_rate": 4.119298245614035e-07,
"loss": 2.0988,
"step": 326
},
{
"epoch": 3.397402597402597,
"grad_norm": 0.36263352632522583,
"learning_rate": 4.1157894736842103e-07,
"loss": 2.0936,
"step": 327
},
{
"epoch": 3.407792207792208,
"grad_norm": 0.3613211214542389,
"learning_rate": 4.1122807017543857e-07,
"loss": 2.1124,
"step": 328
},
{
"epoch": 3.418181818181818,
"grad_norm": 0.3665166199207306,
"learning_rate": 4.108771929824561e-07,
"loss": 2.0896,
"step": 329
},
{
"epoch": 3.4285714285714284,
"grad_norm": 0.36971572041511536,
"learning_rate": 4.1052631578947365e-07,
"loss": 2.0941,
"step": 330
},
{
"epoch": 3.4389610389610388,
"grad_norm": 0.3668130338191986,
"learning_rate": 4.1017543859649125e-07,
"loss": 2.0919,
"step": 331
},
{
"epoch": 3.449350649350649,
"grad_norm": 0.36285045742988586,
"learning_rate": 4.0982456140350874e-07,
"loss": 2.0721,
"step": 332
},
{
"epoch": 3.4597402597402596,
"grad_norm": 0.3531608581542969,
"learning_rate": 4.0947368421052633e-07,
"loss": 2.0658,
"step": 333
},
{
"epoch": 3.47012987012987,
"grad_norm": 0.35173851251602173,
"learning_rate": 4.091228070175438e-07,
"loss": 2.0844,
"step": 334
},
{
"epoch": 3.4805194805194803,
"grad_norm": 0.3617774248123169,
"learning_rate": 4.0877192982456136e-07,
"loss": 2.0994,
"step": 335
},
{
"epoch": 3.4909090909090907,
"grad_norm": 0.3606766164302826,
"learning_rate": 4.0842105263157895e-07,
"loss": 2.1015,
"step": 336
},
{
"epoch": 3.501298701298701,
"grad_norm": 0.3587430417537689,
"learning_rate": 4.0807017543859644e-07,
"loss": 2.0928,
"step": 337
},
{
"epoch": 3.5116883116883115,
"grad_norm": 0.3543533682823181,
"learning_rate": 4.0771929824561403e-07,
"loss": 2.0947,
"step": 338
},
{
"epoch": 3.522077922077922,
"grad_norm": 0.35451260209083557,
"learning_rate": 4.073684210526316e-07,
"loss": 2.098,
"step": 339
},
{
"epoch": 3.5324675324675323,
"grad_norm": 0.35621732473373413,
"learning_rate": 4.070175438596491e-07,
"loss": 2.0926,
"step": 340
},
{
"epoch": 3.5428571428571427,
"grad_norm": 0.3654091954231262,
"learning_rate": 4.0666666666666666e-07,
"loss": 2.113,
"step": 341
},
{
"epoch": 3.553246753246753,
"grad_norm": 0.40453559160232544,
"learning_rate": 4.0631578947368414e-07,
"loss": 2.1097,
"step": 342
},
{
"epoch": 3.5636363636363635,
"grad_norm": 0.3696524202823639,
"learning_rate": 4.0596491228070174e-07,
"loss": 2.1117,
"step": 343
},
{
"epoch": 3.574025974025974,
"grad_norm": 0.36527544260025024,
"learning_rate": 4.056140350877193e-07,
"loss": 2.0999,
"step": 344
},
{
"epoch": 3.5844155844155843,
"grad_norm": 0.3624621033668518,
"learning_rate": 4.052631578947368e-07,
"loss": 2.1076,
"step": 345
},
{
"epoch": 3.5948051948051947,
"grad_norm": 0.3703259527683258,
"learning_rate": 4.0491228070175436e-07,
"loss": 2.0913,
"step": 346
},
{
"epoch": 3.605194805194805,
"grad_norm": 0.36622118949890137,
"learning_rate": 4.0456140350877195e-07,
"loss": 2.099,
"step": 347
},
{
"epoch": 3.6155844155844155,
"grad_norm": 0.36391839385032654,
"learning_rate": 4.0421052631578944e-07,
"loss": 2.0866,
"step": 348
},
{
"epoch": 3.625974025974026,
"grad_norm": 0.3735998272895813,
"learning_rate": 4.0385964912280703e-07,
"loss": 2.1034,
"step": 349
},
{
"epoch": 3.6363636363636362,
"grad_norm": 0.3779623508453369,
"learning_rate": 4.035087719298245e-07,
"loss": 2.0702,
"step": 350
},
{
"epoch": 3.6467532467532466,
"grad_norm": 0.3697824776172638,
"learning_rate": 4.0315789473684206e-07,
"loss": 2.1214,
"step": 351
},
{
"epoch": 3.657142857142857,
"grad_norm": 0.3565923273563385,
"learning_rate": 4.0280701754385966e-07,
"loss": 2.0978,
"step": 352
},
{
"epoch": 3.6675324675324674,
"grad_norm": 0.37301158905029297,
"learning_rate": 4.0245614035087714e-07,
"loss": 2.1241,
"step": 353
},
{
"epoch": 3.677922077922078,
"grad_norm": 0.371933251619339,
"learning_rate": 4.0210526315789474e-07,
"loss": 2.0836,
"step": 354
},
{
"epoch": 3.688311688311688,
"grad_norm": 0.38711678981781006,
"learning_rate": 4.017543859649123e-07,
"loss": 2.0836,
"step": 355
},
{
"epoch": 3.6987012987012986,
"grad_norm": 0.37659522891044617,
"learning_rate": 4.014035087719298e-07,
"loss": 2.0925,
"step": 356
},
{
"epoch": 3.709090909090909,
"grad_norm": 0.37264522910118103,
"learning_rate": 4.0105263157894736e-07,
"loss": 2.0803,
"step": 357
},
{
"epoch": 3.7194805194805194,
"grad_norm": 0.376504510641098,
"learning_rate": 4.007017543859649e-07,
"loss": 2.0925,
"step": 358
},
{
"epoch": 3.72987012987013,
"grad_norm": 0.3663710653781891,
"learning_rate": 4.0035087719298244e-07,
"loss": 2.1138,
"step": 359
},
{
"epoch": 3.74025974025974,
"grad_norm": 0.3685579001903534,
"learning_rate": 4e-07,
"loss": 2.0498,
"step": 360
},
{
"epoch": 3.7506493506493506,
"grad_norm": 0.3644115626811981,
"learning_rate": 3.996491228070175e-07,
"loss": 2.1242,
"step": 361
},
{
"epoch": 3.761038961038961,
"grad_norm": 0.3706153333187103,
"learning_rate": 3.9929824561403506e-07,
"loss": 2.0872,
"step": 362
},
{
"epoch": 3.7714285714285714,
"grad_norm": 0.3608177900314331,
"learning_rate": 3.989473684210526e-07,
"loss": 2.0553,
"step": 363
},
{
"epoch": 3.7818181818181817,
"grad_norm": 0.36829179525375366,
"learning_rate": 3.9859649122807014e-07,
"loss": 2.1001,
"step": 364
},
{
"epoch": 3.792207792207792,
"grad_norm": 0.3892737925052643,
"learning_rate": 3.9824561403508774e-07,
"loss": 2.0975,
"step": 365
},
{
"epoch": 3.8025974025974025,
"grad_norm": 0.3626904785633087,
"learning_rate": 3.978947368421052e-07,
"loss": 2.081,
"step": 366
},
{
"epoch": 3.812987012987013,
"grad_norm": 0.382160484790802,
"learning_rate": 3.9754385964912277e-07,
"loss": 2.095,
"step": 367
},
{
"epoch": 3.8233766233766233,
"grad_norm": 0.3701639473438263,
"learning_rate": 3.9719298245614036e-07,
"loss": 2.0755,
"step": 368
},
{
"epoch": 3.8337662337662337,
"grad_norm": 0.3776034712791443,
"learning_rate": 3.9684210526315785e-07,
"loss": 2.1052,
"step": 369
},
{
"epoch": 3.844155844155844,
"grad_norm": 0.3688242733478546,
"learning_rate": 3.9649122807017544e-07,
"loss": 2.085,
"step": 370
},
{
"epoch": 3.8545454545454545,
"grad_norm": 0.3863198161125183,
"learning_rate": 3.9614035087719293e-07,
"loss": 2.1112,
"step": 371
},
{
"epoch": 3.864935064935065,
"grad_norm": 0.37687695026397705,
"learning_rate": 3.957894736842105e-07,
"loss": 2.102,
"step": 372
},
{
"epoch": 3.8753246753246753,
"grad_norm": 0.3751150071620941,
"learning_rate": 3.9543859649122806e-07,
"loss": 2.1052,
"step": 373
},
{
"epoch": 3.8857142857142857,
"grad_norm": 0.3695361018180847,
"learning_rate": 3.950877192982456e-07,
"loss": 2.0787,
"step": 374
},
{
"epoch": 3.896103896103896,
"grad_norm": 0.3684975802898407,
"learning_rate": 3.9473684210526315e-07,
"loss": 2.0708,
"step": 375
},
{
"epoch": 3.9064935064935065,
"grad_norm": 0.3755075931549072,
"learning_rate": 3.943859649122807e-07,
"loss": 2.0886,
"step": 376
},
{
"epoch": 3.916883116883117,
"grad_norm": 0.3694201707839966,
"learning_rate": 3.9403508771929823e-07,
"loss": 2.0892,
"step": 377
},
{
"epoch": 3.9272727272727272,
"grad_norm": 0.3803527057170868,
"learning_rate": 3.9368421052631577e-07,
"loss": 2.0866,
"step": 378
},
{
"epoch": 3.9376623376623376,
"grad_norm": 0.3805652856826782,
"learning_rate": 3.933333333333333e-07,
"loss": 2.1028,
"step": 379
},
{
"epoch": 3.948051948051948,
"grad_norm": 0.3854037821292877,
"learning_rate": 3.9298245614035085e-07,
"loss": 2.0945,
"step": 380
},
{
"epoch": 3.9584415584415584,
"grad_norm": 0.37754347920417786,
"learning_rate": 3.9263157894736844e-07,
"loss": 2.0963,
"step": 381
},
{
"epoch": 3.968831168831169,
"grad_norm": 0.3717576265335083,
"learning_rate": 3.9228070175438593e-07,
"loss": 2.1052,
"step": 382
},
{
"epoch": 3.979220779220779,
"grad_norm": 0.35753440856933594,
"learning_rate": 3.919298245614035e-07,
"loss": 2.0813,
"step": 383
},
{
"epoch": 3.9896103896103896,
"grad_norm": 0.3805282413959503,
"learning_rate": 3.91578947368421e-07,
"loss": 2.0932,
"step": 384
},
{
"epoch": 4.0,
"grad_norm": 0.3768204152584076,
"learning_rate": 3.9122807017543855e-07,
"loss": 2.1048,
"step": 385
},
{
"epoch": 4.0,
"eval_loss": 2.096661329269409,
"eval_runtime": 9.2491,
"eval_samples_per_second": 2.703,
"eval_steps_per_second": 0.432,
"step": 385
},
{
"epoch": 4.01038961038961,
"grad_norm": 0.3841288685798645,
"learning_rate": 3.9087719298245615e-07,
"loss": 2.1132,
"step": 386
},
{
"epoch": 4.020779220779221,
"grad_norm": 0.3716009855270386,
"learning_rate": 3.9052631578947363e-07,
"loss": 2.1147,
"step": 387
},
{
"epoch": 4.031168831168831,
"grad_norm": 0.37986499071121216,
"learning_rate": 3.9017543859649123e-07,
"loss": 2.085,
"step": 388
},
{
"epoch": 4.041558441558442,
"grad_norm": 0.37856024503707886,
"learning_rate": 3.8982456140350877e-07,
"loss": 2.0922,
"step": 389
},
{
"epoch": 4.0519480519480515,
"grad_norm": 0.3767644464969635,
"learning_rate": 3.894736842105263e-07,
"loss": 2.0762,
"step": 390
},
{
"epoch": 4.062337662337662,
"grad_norm": 0.3725319802761078,
"learning_rate": 3.8912280701754385e-07,
"loss": 2.0878,
"step": 391
},
{
"epoch": 4.072727272727272,
"grad_norm": 0.35704779624938965,
"learning_rate": 3.8877192982456134e-07,
"loss": 2.0943,
"step": 392
},
{
"epoch": 4.083116883116883,
"grad_norm": 0.3813813626766205,
"learning_rate": 3.8842105263157893e-07,
"loss": 2.0769,
"step": 393
},
{
"epoch": 4.093506493506493,
"grad_norm": 0.3729550540447235,
"learning_rate": 3.8807017543859647e-07,
"loss": 2.0747,
"step": 394
},
{
"epoch": 4.103896103896104,
"grad_norm": 0.37814322113990784,
"learning_rate": 3.87719298245614e-07,
"loss": 2.0834,
"step": 395
},
{
"epoch": 4.114285714285714,
"grad_norm": 0.3847734034061432,
"learning_rate": 3.8736842105263155e-07,
"loss": 2.0945,
"step": 396
},
{
"epoch": 4.124675324675325,
"grad_norm": 0.3798567056655884,
"learning_rate": 3.8701754385964915e-07,
"loss": 2.0756,
"step": 397
},
{
"epoch": 4.135064935064935,
"grad_norm": 0.3700884282588959,
"learning_rate": 3.8666666666666664e-07,
"loss": 2.0621,
"step": 398
},
{
"epoch": 4.1454545454545455,
"grad_norm": 0.36993175745010376,
"learning_rate": 3.8631578947368423e-07,
"loss": 2.1099,
"step": 399
},
{
"epoch": 4.1558441558441555,
"grad_norm": 0.380310595035553,
"learning_rate": 3.859649122807017e-07,
"loss": 2.0981,
"step": 400
},
{
"epoch": 4.166233766233766,
"grad_norm": 0.38853439688682556,
"learning_rate": 3.8561403508771926e-07,
"loss": 2.1049,
"step": 401
},
{
"epoch": 4.176623376623376,
"grad_norm": 0.3776048421859741,
"learning_rate": 3.8526315789473685e-07,
"loss": 2.0446,
"step": 402
},
{
"epoch": 4.187012987012987,
"grad_norm": 0.38195568323135376,
"learning_rate": 3.8491228070175434e-07,
"loss": 2.0813,
"step": 403
},
{
"epoch": 4.197402597402597,
"grad_norm": 0.373125284910202,
"learning_rate": 3.8456140350877193e-07,
"loss": 2.0903,
"step": 404
},
{
"epoch": 4.207792207792208,
"grad_norm": 0.3781803548336029,
"learning_rate": 3.842105263157894e-07,
"loss": 2.093,
"step": 405
},
{
"epoch": 4.218181818181818,
"grad_norm": 0.38378608226776123,
"learning_rate": 3.83859649122807e-07,
"loss": 2.0772,
"step": 406
},
{
"epoch": 4.228571428571429,
"grad_norm": 0.3815755248069763,
"learning_rate": 3.8350877192982455e-07,
"loss": 2.0876,
"step": 407
},
{
"epoch": 4.238961038961039,
"grad_norm": 0.3809583783149719,
"learning_rate": 3.831578947368421e-07,
"loss": 2.0631,
"step": 408
},
{
"epoch": 4.249350649350649,
"grad_norm": 0.3809110224246979,
"learning_rate": 3.8280701754385964e-07,
"loss": 2.1069,
"step": 409
},
{
"epoch": 4.259740259740259,
"grad_norm": 0.37152138352394104,
"learning_rate": 3.824561403508772e-07,
"loss": 2.0738,
"step": 410
},
{
"epoch": 4.27012987012987,
"grad_norm": 0.3761196434497833,
"learning_rate": 3.821052631578947e-07,
"loss": 2.0994,
"step": 411
},
{
"epoch": 4.28051948051948,
"grad_norm": 0.39031481742858887,
"learning_rate": 3.8175438596491226e-07,
"loss": 2.0857,
"step": 412
},
{
"epoch": 4.290909090909091,
"grad_norm": 0.37237513065338135,
"learning_rate": 3.814035087719298e-07,
"loss": 2.0844,
"step": 413
},
{
"epoch": 4.301298701298701,
"grad_norm": 0.38423943519592285,
"learning_rate": 3.8105263157894734e-07,
"loss": 2.126,
"step": 414
},
{
"epoch": 4.311688311688312,
"grad_norm": 0.36542361974716187,
"learning_rate": 3.8070175438596493e-07,
"loss": 2.0705,
"step": 415
},
{
"epoch": 4.322077922077922,
"grad_norm": 0.36861154437065125,
"learning_rate": 3.803508771929824e-07,
"loss": 2.0681,
"step": 416
},
{
"epoch": 4.332467532467533,
"grad_norm": 0.3783316910266876,
"learning_rate": 3.7999999999999996e-07,
"loss": 2.0777,
"step": 417
},
{
"epoch": 4.3428571428571425,
"grad_norm": 0.38323143124580383,
"learning_rate": 3.7964912280701756e-07,
"loss": 2.0952,
"step": 418
},
{
"epoch": 4.353246753246753,
"grad_norm": 0.3862488269805908,
"learning_rate": 3.7929824561403504e-07,
"loss": 2.0927,
"step": 419
},
{
"epoch": 4.363636363636363,
"grad_norm": 0.38100945949554443,
"learning_rate": 3.7894736842105264e-07,
"loss": 2.1043,
"step": 420
},
{
"epoch": 4.374025974025974,
"grad_norm": 0.38466402888298035,
"learning_rate": 3.785964912280701e-07,
"loss": 2.0906,
"step": 421
},
{
"epoch": 4.384415584415584,
"grad_norm": 0.37953078746795654,
"learning_rate": 3.782456140350877e-07,
"loss": 2.0706,
"step": 422
},
{
"epoch": 4.394805194805195,
"grad_norm": 0.3823856711387634,
"learning_rate": 3.7789473684210526e-07,
"loss": 2.0951,
"step": 423
},
{
"epoch": 4.405194805194805,
"grad_norm": 0.37538549304008484,
"learning_rate": 3.775438596491228e-07,
"loss": 2.0771,
"step": 424
},
{
"epoch": 4.415584415584416,
"grad_norm": 0.3802937865257263,
"learning_rate": 3.7719298245614034e-07,
"loss": 2.078,
"step": 425
},
{
"epoch": 4.425974025974026,
"grad_norm": 0.3733079433441162,
"learning_rate": 3.7684210526315783e-07,
"loss": 2.0799,
"step": 426
},
{
"epoch": 4.4363636363636365,
"grad_norm": 0.37729039788246155,
"learning_rate": 3.764912280701754e-07,
"loss": 2.1051,
"step": 427
},
{
"epoch": 4.4467532467532465,
"grad_norm": 0.3915861248970032,
"learning_rate": 3.7614035087719296e-07,
"loss": 2.0927,
"step": 428
},
{
"epoch": 4.457142857142857,
"grad_norm": 0.38771378993988037,
"learning_rate": 3.757894736842105e-07,
"loss": 2.0989,
"step": 429
},
{
"epoch": 4.467532467532467,
"grad_norm": 0.3854687213897705,
"learning_rate": 3.7543859649122804e-07,
"loss": 2.0984,
"step": 430
},
{
"epoch": 4.477922077922078,
"grad_norm": 0.3793568015098572,
"learning_rate": 3.7508771929824564e-07,
"loss": 2.0804,
"step": 431
},
{
"epoch": 4.488311688311688,
"grad_norm": 0.39430853724479675,
"learning_rate": 3.747368421052631e-07,
"loss": 2.0985,
"step": 432
},
{
"epoch": 4.498701298701299,
"grad_norm": 0.3847366273403168,
"learning_rate": 3.743859649122807e-07,
"loss": 2.0849,
"step": 433
},
{
"epoch": 4.509090909090909,
"grad_norm": 0.374398797750473,
"learning_rate": 3.740350877192982e-07,
"loss": 2.0847,
"step": 434
},
{
"epoch": 4.51948051948052,
"grad_norm": 0.4258849620819092,
"learning_rate": 3.7368421052631575e-07,
"loss": 2.082,
"step": 435
},
{
"epoch": 4.52987012987013,
"grad_norm": 0.3853350579738617,
"learning_rate": 3.7333333333333334e-07,
"loss": 2.0832,
"step": 436
},
{
"epoch": 4.54025974025974,
"grad_norm": 0.38020631670951843,
"learning_rate": 3.7298245614035083e-07,
"loss": 2.0799,
"step": 437
},
{
"epoch": 4.55064935064935,
"grad_norm": 0.4022679030895233,
"learning_rate": 3.726315789473684e-07,
"loss": 2.1038,
"step": 438
},
{
"epoch": 4.561038961038961,
"grad_norm": 0.37137728929519653,
"learning_rate": 3.7228070175438596e-07,
"loss": 2.0921,
"step": 439
},
{
"epoch": 4.571428571428571,
"grad_norm": 0.38251206278800964,
"learning_rate": 3.719298245614035e-07,
"loss": 2.0878,
"step": 440
},
{
"epoch": 4.581818181818182,
"grad_norm": 0.39200717210769653,
"learning_rate": 3.7157894736842104e-07,
"loss": 2.1202,
"step": 441
},
{
"epoch": 4.592207792207792,
"grad_norm": 0.3731335699558258,
"learning_rate": 3.7122807017543853e-07,
"loss": 2.0737,
"step": 442
},
{
"epoch": 4.602597402597403,
"grad_norm": 0.38276833295822144,
"learning_rate": 3.708771929824561e-07,
"loss": 2.0521,
"step": 443
},
{
"epoch": 4.612987012987013,
"grad_norm": 0.38775137066841125,
"learning_rate": 3.7052631578947367e-07,
"loss": 2.0772,
"step": 444
},
{
"epoch": 4.623376623376624,
"grad_norm": 0.3836955428123474,
"learning_rate": 3.701754385964912e-07,
"loss": 2.0992,
"step": 445
},
{
"epoch": 4.6337662337662335,
"grad_norm": 0.37715139985084534,
"learning_rate": 3.6982456140350875e-07,
"loss": 2.0499,
"step": 446
},
{
"epoch": 4.644155844155844,
"grad_norm": 0.3789008557796478,
"learning_rate": 3.6947368421052634e-07,
"loss": 2.0531,
"step": 447
},
{
"epoch": 4.654545454545454,
"grad_norm": 0.3865036964416504,
"learning_rate": 3.6912280701754383e-07,
"loss": 2.0949,
"step": 448
},
{
"epoch": 4.664935064935065,
"grad_norm": 0.3880210816860199,
"learning_rate": 3.687719298245614e-07,
"loss": 2.0871,
"step": 449
},
{
"epoch": 4.675324675324675,
"grad_norm": 0.3839876353740692,
"learning_rate": 3.684210526315789e-07,
"loss": 2.0586,
"step": 450
},
{
"epoch": 4.685714285714286,
"grad_norm": 0.39316463470458984,
"learning_rate": 3.6807017543859645e-07,
"loss": 2.0736,
"step": 451
},
{
"epoch": 4.696103896103896,
"grad_norm": 0.37328803539276123,
"learning_rate": 3.6771929824561405e-07,
"loss": 2.084,
"step": 452
},
{
"epoch": 4.706493506493507,
"grad_norm": 0.3884430527687073,
"learning_rate": 3.6736842105263153e-07,
"loss": 2.0788,
"step": 453
},
{
"epoch": 4.716883116883117,
"grad_norm": 0.385623574256897,
"learning_rate": 3.6701754385964913e-07,
"loss": 2.0705,
"step": 454
},
{
"epoch": 4.7272727272727275,
"grad_norm": 0.38950812816619873,
"learning_rate": 3.666666666666666e-07,
"loss": 2.0785,
"step": 455
},
{
"epoch": 4.7376623376623375,
"grad_norm": 0.38535040616989136,
"learning_rate": 3.663157894736842e-07,
"loss": 2.0909,
"step": 456
},
{
"epoch": 4.748051948051948,
"grad_norm": 0.3869593143463135,
"learning_rate": 3.6596491228070175e-07,
"loss": 2.0801,
"step": 457
},
{
"epoch": 4.758441558441558,
"grad_norm": 0.39084428548812866,
"learning_rate": 3.656140350877193e-07,
"loss": 2.096,
"step": 458
},
{
"epoch": 4.768831168831169,
"grad_norm": 0.3794546127319336,
"learning_rate": 3.6526315789473683e-07,
"loss": 2.0527,
"step": 459
},
{
"epoch": 4.779220779220779,
"grad_norm": 0.3870809078216553,
"learning_rate": 3.6491228070175437e-07,
"loss": 2.0853,
"step": 460
},
{
"epoch": 4.78961038961039,
"grad_norm": 0.38205036520957947,
"learning_rate": 3.645614035087719e-07,
"loss": 2.0643,
"step": 461
},
{
"epoch": 4.8,
"grad_norm": 0.3907061815261841,
"learning_rate": 3.6421052631578945e-07,
"loss": 2.0786,
"step": 462
},
{
"epoch": 4.810389610389611,
"grad_norm": 0.39493080973625183,
"learning_rate": 3.63859649122807e-07,
"loss": 2.0944,
"step": 463
},
{
"epoch": 4.820779220779221,
"grad_norm": 0.3930380046367645,
"learning_rate": 3.6350877192982453e-07,
"loss": 2.1138,
"step": 464
},
{
"epoch": 4.8311688311688314,
"grad_norm": 0.3952060639858246,
"learning_rate": 3.6315789473684213e-07,
"loss": 2.0802,
"step": 465
},
{
"epoch": 4.841558441558441,
"grad_norm": 0.3815995752811432,
"learning_rate": 3.628070175438596e-07,
"loss": 2.0838,
"step": 466
},
{
"epoch": 4.851948051948052,
"grad_norm": 0.38858020305633545,
"learning_rate": 3.6245614035087716e-07,
"loss": 2.0804,
"step": 467
},
{
"epoch": 4.862337662337662,
"grad_norm": 0.385565847158432,
"learning_rate": 3.6210526315789475e-07,
"loss": 2.0974,
"step": 468
},
{
"epoch": 4.872727272727273,
"grad_norm": 0.3909178078174591,
"learning_rate": 3.6175438596491224e-07,
"loss": 2.0887,
"step": 469
},
{
"epoch": 4.883116883116883,
"grad_norm": 0.3982325792312622,
"learning_rate": 3.6140350877192983e-07,
"loss": 2.1054,
"step": 470
},
{
"epoch": 4.893506493506494,
"grad_norm": 0.3876339793205261,
"learning_rate": 3.610526315789473e-07,
"loss": 2.1054,
"step": 471
},
{
"epoch": 4.903896103896104,
"grad_norm": 0.3819069266319275,
"learning_rate": 3.607017543859649e-07,
"loss": 2.0821,
"step": 472
},
{
"epoch": 4.914285714285715,
"grad_norm": 0.3924694359302521,
"learning_rate": 3.6035087719298245e-07,
"loss": 2.0712,
"step": 473
},
{
"epoch": 4.9246753246753245,
"grad_norm": 0.3937675654888153,
"learning_rate": 3.6e-07,
"loss": 2.1057,
"step": 474
},
{
"epoch": 4.935064935064935,
"grad_norm": 0.38620275259017944,
"learning_rate": 3.5964912280701754e-07,
"loss": 2.0845,
"step": 475
},
{
"epoch": 4.945454545454545,
"grad_norm": 0.40442150831222534,
"learning_rate": 3.59298245614035e-07,
"loss": 2.0936,
"step": 476
},
{
"epoch": 4.955844155844156,
"grad_norm": 0.3815317153930664,
"learning_rate": 3.589473684210526e-07,
"loss": 2.0845,
"step": 477
},
{
"epoch": 4.966233766233766,
"grad_norm": 0.38584476709365845,
"learning_rate": 3.5859649122807016e-07,
"loss": 2.071,
"step": 478
},
{
"epoch": 4.976623376623377,
"grad_norm": 0.3887505829334259,
"learning_rate": 3.582456140350877e-07,
"loss": 2.0924,
"step": 479
},
{
"epoch": 4.987012987012987,
"grad_norm": 0.3836219012737274,
"learning_rate": 3.5789473684210524e-07,
"loss": 2.0986,
"step": 480
},
{
"epoch": 4.997402597402598,
"grad_norm": 0.38430362939834595,
"learning_rate": 3.5754385964912283e-07,
"loss": 2.1006,
"step": 481
},
{
"epoch": 4.997402597402598,
"eval_loss": 2.0890417098999023,
"eval_runtime": 9.2277,
"eval_samples_per_second": 2.709,
"eval_steps_per_second": 0.433,
"step": 481
}
],
"logging_steps": 1,
"max_steps": 1500,
"num_input_tokens_seen": 0,
"num_train_epochs": 16,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.690178034414387e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}