{
"best_metric": 1.5042036771774292,
"best_model_checkpoint": "miner_id_24/checkpoint-150",
"epoch": 3.009009009009009,
"eval_steps": 50,
"global_step": 167,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.018018018018018018,
"grad_norm": 0.20094779133796692,
"learning_rate": 1e-05,
"loss": 1.7238,
"step": 1
},
{
"epoch": 0.018018018018018018,
"eval_loss": 2.4445245265960693,
"eval_runtime": 1.9421,
"eval_samples_per_second": 48.402,
"eval_steps_per_second": 12.358,
"step": 1
},
{
"epoch": 0.036036036036036036,
"grad_norm": 0.23159636557102203,
"learning_rate": 2e-05,
"loss": 2.3157,
"step": 2
},
{
"epoch": 0.05405405405405406,
"grad_norm": 0.17576082050800323,
"learning_rate": 3e-05,
"loss": 1.889,
"step": 3
},
{
"epoch": 0.07207207207207207,
"grad_norm": 0.18563885986804962,
"learning_rate": 4e-05,
"loss": 1.8117,
"step": 4
},
{
"epoch": 0.09009009009009009,
"grad_norm": 0.2837836742401123,
"learning_rate": 5e-05,
"loss": 2.5249,
"step": 5
},
{
"epoch": 0.10810810810810811,
"grad_norm": 0.5448350310325623,
"learning_rate": 6e-05,
"loss": 2.5175,
"step": 6
},
{
"epoch": 0.12612612612612611,
"grad_norm": 0.5622051358222961,
"learning_rate": 7e-05,
"loss": 2.6268,
"step": 7
},
{
"epoch": 0.14414414414414414,
"grad_norm": 0.5050106048583984,
"learning_rate": 8e-05,
"loss": 2.5345,
"step": 8
},
{
"epoch": 0.16216216216216217,
"grad_norm": 0.3459974229335785,
"learning_rate": 9e-05,
"loss": 2.6653,
"step": 9
},
{
"epoch": 0.18018018018018017,
"grad_norm": 0.2534170150756836,
"learning_rate": 0.0001,
"loss": 1.9698,
"step": 10
},
{
"epoch": 0.1981981981981982,
"grad_norm": 0.2756398916244507,
"learning_rate": 9.998999018714263e-05,
"loss": 1.7193,
"step": 11
},
{
"epoch": 0.21621621621621623,
"grad_norm": 0.3350951075553894,
"learning_rate": 9.995996475642466e-05,
"loss": 2.133,
"step": 12
},
{
"epoch": 0.23423423423423423,
"grad_norm": 0.5594307780265808,
"learning_rate": 9.990993572980378e-05,
"loss": 3.0643,
"step": 13
},
{
"epoch": 0.25225225225225223,
"grad_norm": 0.13822466135025024,
"learning_rate": 9.983992313852774e-05,
"loss": 1.7842,
"step": 14
},
{
"epoch": 0.2702702702702703,
"grad_norm": 0.14877331256866455,
"learning_rate": 9.974995501511404e-05,
"loss": 2.2152,
"step": 15
},
{
"epoch": 0.2882882882882883,
"grad_norm": 0.13884077966213226,
"learning_rate": 9.964006738212575e-05,
"loss": 1.8158,
"step": 16
},
{
"epoch": 0.3063063063063063,
"grad_norm": 0.17055541276931763,
"learning_rate": 9.951030423774859e-05,
"loss": 2.0037,
"step": 17
},
{
"epoch": 0.32432432432432434,
"grad_norm": 0.44902870059013367,
"learning_rate": 9.936071753817415e-05,
"loss": 2.1393,
"step": 18
},
{
"epoch": 0.34234234234234234,
"grad_norm": 0.4291950762271881,
"learning_rate": 9.919136717679722e-05,
"loss": 2.232,
"step": 19
},
{
"epoch": 0.36036036036036034,
"grad_norm": 0.46815672516822815,
"learning_rate": 9.900232096023477e-05,
"loss": 2.1575,
"step": 20
},
{
"epoch": 0.3783783783783784,
"grad_norm": 0.35431531071662903,
"learning_rate": 9.879365458117678e-05,
"loss": 2.1213,
"step": 21
},
{
"epoch": 0.3963963963963964,
"grad_norm": 0.2607969045639038,
"learning_rate": 9.856545158807938e-05,
"loss": 2.4743,
"step": 22
},
{
"epoch": 0.4144144144144144,
"grad_norm": 0.20863136649131775,
"learning_rate": 9.831780335171279e-05,
"loss": 1.7214,
"step": 23
},
{
"epoch": 0.43243243243243246,
"grad_norm": 0.31846967339515686,
"learning_rate": 9.805080902857699e-05,
"loss": 2.2591,
"step": 24
},
{
"epoch": 0.45045045045045046,
"grad_norm": 0.34938788414001465,
"learning_rate": 9.776457552120033e-05,
"loss": 2.2996,
"step": 25
},
{
"epoch": 0.46846846846846846,
"grad_norm": 0.6246058940887451,
"learning_rate": 9.745921743533653e-05,
"loss": 2.1906,
"step": 26
},
{
"epoch": 0.4864864864864865,
"grad_norm": 0.14733730256557465,
"learning_rate": 9.713485703407731e-05,
"loss": 1.6917,
"step": 27
},
{
"epoch": 0.5045045045045045,
"grad_norm": 0.2119920402765274,
"learning_rate": 9.679162418889931e-05,
"loss": 2.1286,
"step": 28
},
{
"epoch": 0.5225225225225225,
"grad_norm": 0.20382221043109894,
"learning_rate": 9.642965632766436e-05,
"loss": 1.889,
"step": 29
},
{
"epoch": 0.5405405405405406,
"grad_norm": 0.22297801077365875,
"learning_rate": 9.604909837959455e-05,
"loss": 2.1375,
"step": 30
},
{
"epoch": 0.5585585585585585,
"grad_norm": 0.29066169261932373,
"learning_rate": 9.565010271724352e-05,
"loss": 1.6588,
"step": 31
},
{
"epoch": 0.5765765765765766,
"grad_norm": 0.4789801836013794,
"learning_rate": 9.523282909548773e-05,
"loss": 1.7155,
"step": 32
},
{
"epoch": 0.5945945945945946,
"grad_norm": 0.5113940834999084,
"learning_rate": 9.47974445875617e-05,
"loss": 1.68,
"step": 33
},
{
"epoch": 0.6126126126126126,
"grad_norm": 0.5598991513252258,
"learning_rate": 9.434412351816328e-05,
"loss": 1.5835,
"step": 34
},
{
"epoch": 0.6306306306306306,
"grad_norm": 0.23446883261203766,
"learning_rate": 9.387304739365523e-05,
"loss": 2.0415,
"step": 35
},
{
"epoch": 0.6486486486486487,
"grad_norm": 0.23417960107326508,
"learning_rate": 9.338440482939146e-05,
"loss": 1.9709,
"step": 36
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.2393627017736435,
"learning_rate": 9.287839147419686e-05,
"loss": 1.809,
"step": 37
},
{
"epoch": 0.6846846846846847,
"grad_norm": 0.37448129057884216,
"learning_rate": 9.2355209932031e-05,
"loss": 2.3969,
"step": 38
},
{
"epoch": 0.7027027027027027,
"grad_norm": 0.5388697385787964,
"learning_rate": 9.181506968086697e-05,
"loss": 2.8039,
"step": 39
},
{
"epoch": 0.7207207207207207,
"grad_norm": 0.13199251890182495,
"learning_rate": 9.125818698881798e-05,
"loss": 1.6472,
"step": 40
},
{
"epoch": 0.7387387387387387,
"grad_norm": 0.21171270310878754,
"learning_rate": 9.068478482754532e-05,
"loss": 2.5439,
"step": 41
},
{
"epoch": 0.7567567567567568,
"grad_norm": 0.19285422563552856,
"learning_rate": 9.0095092782982e-05,
"loss": 1.7294,
"step": 42
},
{
"epoch": 0.7747747747747747,
"grad_norm": 0.19233770668506622,
"learning_rate": 8.948934696340843e-05,
"loss": 1.8547,
"step": 43
},
{
"epoch": 0.7927927927927928,
"grad_norm": 2.065861225128174,
"learning_rate": 8.886778990491631e-05,
"loss": 1.4244,
"step": 44
},
{
"epoch": 0.8108108108108109,
"grad_norm": 1.3515313863754272,
"learning_rate": 8.823067047429907e-05,
"loss": 1.3373,
"step": 45
},
{
"epoch": 0.8288288288288288,
"grad_norm": 0.7247907519340515,
"learning_rate": 8.757824376940746e-05,
"loss": 1.1633,
"step": 46
},
{
"epoch": 0.8468468468468469,
"grad_norm": 0.5490639209747314,
"learning_rate": 8.691077101701024e-05,
"loss": 1.3324,
"step": 47
},
{
"epoch": 0.8648648648648649,
"grad_norm": 0.2311074584722519,
"learning_rate": 8.622851946820095e-05,
"loss": 2.2242,
"step": 48
},
{
"epoch": 0.8828828828828829,
"grad_norm": 0.20029251277446747,
"learning_rate": 8.553176229139261e-05,
"loss": 1.7394,
"step": 49
},
{
"epoch": 0.9009009009009009,
"grad_norm": 0.27592265605926514,
"learning_rate": 8.482077846294308e-05,
"loss": 2.1438,
"step": 50
},
{
"epoch": 0.9009009009009009,
"eval_loss": 1.783769965171814,
"eval_runtime": 1.9447,
"eval_samples_per_second": 48.337,
"eval_steps_per_second": 12.341,
"step": 50
},
{
"epoch": 0.918918918918919,
"grad_norm": 0.3021310269832611,
"learning_rate": 8.409585265545509e-05,
"loss": 2.0702,
"step": 51
},
{
"epoch": 0.9369369369369369,
"grad_norm": 0.5802788734436035,
"learning_rate": 8.335727512379534e-05,
"loss": 2.8902,
"step": 52
},
{
"epoch": 0.954954954954955,
"grad_norm": 0.1312423199415207,
"learning_rate": 8.260534158887876e-05,
"loss": 1.7881,
"step": 53
},
{
"epoch": 0.972972972972973,
"grad_norm": 0.742950439453125,
"learning_rate": 8.184035311926396e-05,
"loss": 1.1818,
"step": 54
},
{
"epoch": 0.990990990990991,
"grad_norm": 0.3108900189399719,
"learning_rate": 8.106261601060772e-05,
"loss": 1.7755,
"step": 55
},
{
"epoch": 1.009009009009009,
"grad_norm": 0.5375937223434448,
"learning_rate": 8.027244166302642e-05,
"loss": 3.1039,
"step": 56
},
{
"epoch": 1.027027027027027,
"grad_norm": 0.21149364113807678,
"learning_rate": 7.947014645641379e-05,
"loss": 2.5015,
"step": 57
},
{
"epoch": 1.045045045045045,
"grad_norm": 0.1894480586051941,
"learning_rate": 7.865605162376486e-05,
"loss": 1.9007,
"step": 58
},
{
"epoch": 1.063063063063063,
"grad_norm": 0.20430035889148712,
"learning_rate": 7.783048312255653e-05,
"loss": 1.7878,
"step": 59
},
{
"epoch": 1.0810810810810811,
"grad_norm": 0.2826579511165619,
"learning_rate": 7.699377150423672e-05,
"loss": 1.5133,
"step": 60
},
{
"epoch": 1.0990990990990992,
"grad_norm": 0.7269498705863953,
"learning_rate": 7.614625178187402e-05,
"loss": 0.8433,
"step": 61
},
{
"epoch": 1.117117117117117,
"grad_norm": 0.6260727047920227,
"learning_rate": 7.528826329602099e-05,
"loss": 0.8916,
"step": 62
},
{
"epoch": 1.135135135135135,
"grad_norm": 0.5703617334365845,
"learning_rate": 7.442014957884472e-05,
"loss": 0.8438,
"step": 63
},
{
"epoch": 1.1531531531531531,
"grad_norm": 0.27178001403808594,
"learning_rate": 7.354225821657914e-05,
"loss": 2.1417,
"step": 64
},
{
"epoch": 1.1711711711711712,
"grad_norm": 0.23363694548606873,
"learning_rate": 7.265494071035401e-05,
"loss": 1.7944,
"step": 65
},
{
"epoch": 1.1891891891891893,
"grad_norm": 0.2847091257572174,
"learning_rate": 7.175855233545668e-05,
"loss": 1.751,
"step": 66
},
{
"epoch": 1.2072072072072073,
"grad_norm": 0.3543185293674469,
"learning_rate": 7.085345199908235e-05,
"loss": 2.0152,
"step": 67
},
{
"epoch": 1.2252252252252251,
"grad_norm": 0.5327386856079102,
"learning_rate": 6.994000209663036e-05,
"loss": 2.6897,
"step": 68
},
{
"epoch": 1.2432432432432432,
"grad_norm": 0.2581247091293335,
"learning_rate": 6.901856836660386e-05,
"loss": 1.2071,
"step": 69
},
{
"epoch": 1.2612612612612613,
"grad_norm": 0.18185599148273468,
"learning_rate": 6.808951974417078e-05,
"loss": 2.24,
"step": 70
},
{
"epoch": 1.2792792792792793,
"grad_norm": 0.21230140328407288,
"learning_rate": 6.715322821344494e-05,
"loss": 2.0791,
"step": 71
},
{
"epoch": 1.2972972972972974,
"grad_norm": 0.20122762024402618,
"learning_rate": 6.621006865854644e-05,
"loss": 1.6265,
"step": 72
},
{
"epoch": 1.3153153153153152,
"grad_norm": 0.22769460082054138,
"learning_rate": 6.526041871350086e-05,
"loss": 1.5291,
"step": 73
},
{
"epoch": 1.3333333333333333,
"grad_norm": 1.8807597160339355,
"learning_rate": 6.43046586110374e-05,
"loss": 0.7391,
"step": 74
},
{
"epoch": 1.3513513513513513,
"grad_norm": 1.10353422164917,
"learning_rate": 6.334317103034652e-05,
"loss": 0.6971,
"step": 75
},
{
"epoch": 1.3693693693693694,
"grad_norm": 0.6892307996749878,
"learning_rate": 6.237634094385813e-05,
"loss": 0.7755,
"step": 76
},
{
"epoch": 1.3873873873873874,
"grad_norm": 0.28367915749549866,
"learning_rate": 6.140455546310148e-05,
"loss": 2.0102,
"step": 77
},
{
"epoch": 1.4054054054054055,
"grad_norm": 0.354563444852829,
"learning_rate": 6.042820368370854e-05,
"loss": 2.0273,
"step": 78
},
{
"epoch": 1.4234234234234235,
"grad_norm": 0.2600463926792145,
"learning_rate": 5.944767652962309e-05,
"loss": 1.6145,
"step": 79
},
{
"epoch": 1.4414414414414414,
"grad_norm": 0.29232466220855713,
"learning_rate": 5.8463366596577706e-05,
"loss": 2.1392,
"step": 80
},
{
"epoch": 1.4594594594594594,
"grad_norm": 0.3904821276664734,
"learning_rate": 5.747566799490132e-05,
"loss": 2.3455,
"step": 81
},
{
"epoch": 1.4774774774774775,
"grad_norm": 0.3034067153930664,
"learning_rate": 5.648497619172042e-05,
"loss": 1.6627,
"step": 82
},
{
"epoch": 1.4954954954954955,
"grad_norm": 0.18810759484767914,
"learning_rate": 5.549168785261698e-05,
"loss": 2.3614,
"step": 83
},
{
"epoch": 1.5135135135135136,
"grad_norm": 0.15777312219142914,
"learning_rate": 5.4496200682806495e-05,
"loss": 1.697,
"step": 84
},
{
"epoch": 1.5315315315315314,
"grad_norm": 0.19415871798992157,
"learning_rate": 5.3498913267899864e-05,
"loss": 1.9134,
"step": 85
},
{
"epoch": 1.5495495495495497,
"grad_norm": 0.17604829370975494,
"learning_rate": 5.250022491431259e-05,
"loss": 1.8209,
"step": 86
},
{
"epoch": 1.5675675675675675,
"grad_norm": 1.056093692779541,
"learning_rate": 5.150053548938557e-05,
"loss": 0.6133,
"step": 87
},
{
"epoch": 1.5855855855855856,
"grad_norm": 0.8092790246009827,
"learning_rate": 5.0500245261281175e-05,
"loss": 0.7028,
"step": 88
},
{
"epoch": 1.6036036036036037,
"grad_norm": 0.6385818123817444,
"learning_rate": 4.949975473871884e-05,
"loss": 0.4955,
"step": 89
},
{
"epoch": 1.6216216216216215,
"grad_norm": 0.2813832759857178,
"learning_rate": 4.849946451061443e-05,
"loss": 1.7059,
"step": 90
},
{
"epoch": 1.6396396396396398,
"grad_norm": 0.25375688076019287,
"learning_rate": 4.749977508568742e-05,
"loss": 2.027,
"step": 91
},
{
"epoch": 1.6576576576576576,
"grad_norm": 0.28173476457595825,
"learning_rate": 4.650108673210015e-05,
"loss": 1.5781,
"step": 92
},
{
"epoch": 1.6756756756756757,
"grad_norm": 0.37018927931785583,
"learning_rate": 4.550379931719351e-05,
"loss": 1.9611,
"step": 93
},
{
"epoch": 1.6936936936936937,
"grad_norm": 0.44017812609672546,
"learning_rate": 4.4508312147383036e-05,
"loss": 1.9964,
"step": 94
},
{
"epoch": 1.7117117117117115,
"grad_norm": 0.27065154910087585,
"learning_rate": 4.3515023808279586e-05,
"loss": 1.7931,
"step": 95
},
{
"epoch": 1.7297297297297298,
"grad_norm": 0.1628153771162033,
"learning_rate": 4.252433200509869e-05,
"loss": 2.0541,
"step": 96
},
{
"epoch": 1.7477477477477477,
"grad_norm": 0.15087255835533142,
"learning_rate": 4.1536633403422306e-05,
"loss": 1.5446,
"step": 97
},
{
"epoch": 1.7657657657657657,
"grad_norm": 0.19169510900974274,
"learning_rate": 4.0552323470376916e-05,
"loss": 2.1779,
"step": 98
},
{
"epoch": 1.7837837837837838,
"grad_norm": 0.17723213136196136,
"learning_rate": 3.9571796316291476e-05,
"loss": 1.5049,
"step": 99
},
{
"epoch": 1.8018018018018018,
"grad_norm": 0.5622679591178894,
"learning_rate": 3.859544453689853e-05,
"loss": 0.538,
"step": 100
},
{
"epoch": 1.8018018018018018,
"eval_loss": 1.5607163906097412,
"eval_runtime": 1.8915,
"eval_samples_per_second": 49.696,
"eval_steps_per_second": 12.688,
"step": 100
},
{
"epoch": 1.8198198198198199,
"grad_norm": 0.459534227848053,
"learning_rate": 3.762365905614187e-05,
"loss": 0.6408,
"step": 101
},
{
"epoch": 1.8378378378378377,
"grad_norm": 0.45416250824928284,
"learning_rate": 3.665682896965349e-05,
"loss": 0.6204,
"step": 102
},
{
"epoch": 1.855855855855856,
"grad_norm": 0.3494967818260193,
"learning_rate": 3.5695341388962614e-05,
"loss": 0.8394,
"step": 103
},
{
"epoch": 1.8738738738738738,
"grad_norm": 0.20583558082580566,
"learning_rate": 3.473958128649915e-05,
"loss": 1.7962,
"step": 104
},
{
"epoch": 1.8918918918918919,
"grad_norm": 0.2505161762237549,
"learning_rate": 3.378993134145356e-05,
"loss": 1.8203,
"step": 105
},
{
"epoch": 1.90990990990991,
"grad_norm": 0.2894545793533325,
"learning_rate": 3.284677178655507e-05,
"loss": 1.7578,
"step": 106
},
{
"epoch": 1.9279279279279278,
"grad_norm": 0.36985740065574646,
"learning_rate": 3.1910480255829237e-05,
"loss": 2.3144,
"step": 107
},
{
"epoch": 1.945945945945946,
"grad_norm": 0.29496514797210693,
"learning_rate": 3.098143163339615e-05,
"loss": 2.2301,
"step": 108
},
{
"epoch": 1.9639639639639639,
"grad_norm": 0.13244111835956573,
"learning_rate": 3.0059997903369656e-05,
"loss": 1.2973,
"step": 109
},
{
"epoch": 1.981981981981982,
"grad_norm": 0.1934548020362854,
"learning_rate": 2.914654800091768e-05,
"loss": 1.0715,
"step": 110
},
{
"epoch": 2.0,
"grad_norm": 0.7730455994606018,
"learning_rate": 2.824144766454333e-05,
"loss": 2.955,
"step": 111
},
{
"epoch": 2.018018018018018,
"grad_norm": 0.14186394214630127,
"learning_rate": 2.7345059289646008e-05,
"loss": 1.6136,
"step": 112
},
{
"epoch": 2.036036036036036,
"grad_norm": 0.1974637508392334,
"learning_rate": 2.6457741783420886e-05,
"loss": 2.1953,
"step": 113
},
{
"epoch": 2.054054054054054,
"grad_norm": 0.15466494858264923,
"learning_rate": 2.5579850421155293e-05,
"loss": 1.5703,
"step": 114
},
{
"epoch": 2.0720720720720722,
"grad_norm": 0.17066383361816406,
"learning_rate": 2.4711736703979018e-05,
"loss": 1.8892,
"step": 115
},
{
"epoch": 2.09009009009009,
"grad_norm": 0.18141521513462067,
"learning_rate": 2.3853748218126e-05,
"loss": 0.7031,
"step": 116
},
{
"epoch": 2.108108108108108,
"grad_norm": 0.1847659945487976,
"learning_rate": 2.3006228495763295e-05,
"loss": 0.425,
"step": 117
},
{
"epoch": 2.126126126126126,
"grad_norm": 0.18299931287765503,
"learning_rate": 2.2169516877443485e-05,
"loss": 0.4302,
"step": 118
},
{
"epoch": 2.144144144144144,
"grad_norm": 0.1804618388414383,
"learning_rate": 2.1343948376235144e-05,
"loss": 1.5657,
"step": 119
},
{
"epoch": 2.1621621621621623,
"grad_norm": 0.22961750626564026,
"learning_rate": 2.052985354358622e-05,
"loss": 1.9153,
"step": 120
},
{
"epoch": 2.18018018018018,
"grad_norm": 0.24420714378356934,
"learning_rate": 1.9727558336973595e-05,
"loss": 1.726,
"step": 121
},
{
"epoch": 2.1981981981981984,
"grad_norm": 0.27104926109313965,
"learning_rate": 1.8937383989392294e-05,
"loss": 1.7018,
"step": 122
},
{
"epoch": 2.2162162162162162,
"grad_norm": 0.37170183658599854,
"learning_rate": 1.8159646880736036e-05,
"loss": 1.9319,
"step": 123
},
{
"epoch": 2.234234234234234,
"grad_norm": 0.4491060674190521,
"learning_rate": 1.739465841112125e-05,
"loss": 2.0411,
"step": 124
},
{
"epoch": 2.2522522522522523,
"grad_norm": 0.1254836916923523,
"learning_rate": 1.664272487620466e-05,
"loss": 1.4904,
"step": 125
},
{
"epoch": 2.27027027027027,
"grad_norm": 0.15240132808685303,
"learning_rate": 1.590414734454493e-05,
"loss": 2.1468,
"step": 126
},
{
"epoch": 2.2882882882882885,
"grad_norm": 0.16004954278469086,
"learning_rate": 1.517922153705692e-05,
"loss": 1.8539,
"step": 127
},
{
"epoch": 2.3063063063063063,
"grad_norm": 0.18988263607025146,
"learning_rate": 1.4468237708607397e-05,
"loss": 1.7977,
"step": 128
},
{
"epoch": 2.3243243243243246,
"grad_norm": 0.18853111565113068,
"learning_rate": 1.3771480531799052e-05,
"loss": 1.6049,
"step": 129
},
{
"epoch": 2.3423423423423424,
"grad_norm": 0.1556941568851471,
"learning_rate": 1.308922898298977e-05,
"loss": 0.3974,
"step": 130
},
{
"epoch": 2.3603603603603602,
"grad_norm": 0.17364652454853058,
"learning_rate": 1.2421756230592534e-05,
"loss": 0.4047,
"step": 131
},
{
"epoch": 2.3783783783783785,
"grad_norm": 0.15375109016895294,
"learning_rate": 1.1769329525700935e-05,
"loss": 0.443,
"step": 132
},
{
"epoch": 2.3963963963963963,
"grad_norm": 0.2470708191394806,
"learning_rate": 1.1132210095083694e-05,
"loss": 1.5159,
"step": 133
},
{
"epoch": 2.4144144144144146,
"grad_norm": 0.21014344692230225,
"learning_rate": 1.0510653036591583e-05,
"loss": 1.8404,
"step": 134
},
{
"epoch": 2.4324324324324325,
"grad_norm": 0.23266132175922394,
"learning_rate": 9.904907217018e-06,
"loss": 1.3737,
"step": 135
},
{
"epoch": 2.4504504504504503,
"grad_norm": 0.32797127962112427,
"learning_rate": 9.31521517245469e-06,
"loss": 2.1399,
"step": 136
},
{
"epoch": 2.4684684684684686,
"grad_norm": 0.4745734930038452,
"learning_rate": 8.741813011182014e-06,
"loss": 2.2424,
"step": 137
},
{
"epoch": 2.4864864864864864,
"grad_norm": 0.13079120218753815,
"learning_rate": 8.18493031913305e-06,
"loss": 1.5224,
"step": 138
},
{
"epoch": 2.5045045045045047,
"grad_norm": 0.1728202998638153,
"learning_rate": 7.644790067969005e-06,
"loss": 2.0656,
"step": 139
},
{
"epoch": 2.5225225225225225,
"grad_norm": 0.20250830054283142,
"learning_rate": 7.1216085258031414e-06,
"loss": 2.0862,
"step": 140
},
{
"epoch": 2.5405405405405403,
"grad_norm": 0.16197475790977478,
"learning_rate": 6.6155951706085405e-06,
"loss": 1.4935,
"step": 141
},
{
"epoch": 2.5585585585585586,
"grad_norm": 0.15914876759052277,
"learning_rate": 6.1269526063447765e-06,
"loss": 0.5048,
"step": 142
},
{
"epoch": 2.5765765765765765,
"grad_norm": 0.18777939677238464,
"learning_rate": 5.6558764818367195e-06,
"loss": 0.6094,
"step": 143
},
{
"epoch": 2.5945945945945947,
"grad_norm": 0.14933425188064575,
"learning_rate": 5.2025554124383095e-06,
"loss": 0.4805,
"step": 144
},
{
"epoch": 2.6126126126126126,
"grad_norm": 0.1998123824596405,
"learning_rate": 4.767170904512292e-06,
"loss": 1.3946,
"step": 145
},
{
"epoch": 2.6306306306306304,
"grad_norm": 0.270035982131958,
"learning_rate": 4.349897282756487e-06,
"loss": 2.0762,
"step": 146
},
{
"epoch": 2.6486486486486487,
"grad_norm": 0.22045235335826874,
"learning_rate": 3.95090162040545e-06,
"loss": 1.8445,
"step": 147
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.25569671392440796,
"learning_rate": 3.570343672335641e-06,
"loss": 1.7808,
"step": 148
},
{
"epoch": 2.684684684684685,
"grad_norm": 0.3833853602409363,
"learning_rate": 3.2083758111006945e-06,
"loss": 1.7097,
"step": 149
},
{
"epoch": 2.7027027027027026,
"grad_norm": 0.46845442056655884,
"learning_rate": 2.86514296592269e-06,
"loss": 2.2938,
"step": 150
},
{
"epoch": 2.7027027027027026,
"eval_loss": 1.5042036771774292,
"eval_runtime": 1.8931,
"eval_samples_per_second": 49.654,
"eval_steps_per_second": 12.678,
"step": 150
},
{
"epoch": 2.7207207207207205,
"grad_norm": 0.1713583618402481,
"learning_rate": 2.54078256466348e-06,
"loss": 1.8336,
"step": 151
},
{
"epoch": 2.7387387387387387,
"grad_norm": 0.17491984367370605,
"learning_rate": 2.2354244787996748e-06,
"loss": 2.1273,
"step": 152
},
{
"epoch": 2.756756756756757,
"grad_norm": 0.1518978625535965,
"learning_rate": 1.9491909714230204e-06,
"loss": 1.8808,
"step": 153
},
{
"epoch": 2.774774774774775,
"grad_norm": 0.16476936638355255,
"learning_rate": 1.6821966482872264e-06,
"loss": 1.7178,
"step": 154
},
{
"epoch": 2.7927927927927927,
"grad_norm": 0.16660767793655396,
"learning_rate": 1.434548411920622e-06,
"loss": 1.1728,
"step": 155
},
{
"epoch": 2.810810810810811,
"grad_norm": 0.15310387313365936,
"learning_rate": 1.206345418823235e-06,
"loss": 0.4533,
"step": 156
},
{
"epoch": 2.828828828828829,
"grad_norm": 0.16932179033756256,
"learning_rate": 9.976790397652315e-07,
"loss": 0.4293,
"step": 157
},
{
"epoch": 2.846846846846847,
"grad_norm": 0.2117478847503662,
"learning_rate": 8.086328232027873e-07,
"loss": 0.5155,
"step": 158
},
{
"epoch": 2.864864864864865,
"grad_norm": 0.22566819190979004,
"learning_rate": 6.392824618258519e-07,
"loss": 1.8127,
"step": 159
},
{
"epoch": 2.8828828828828827,
"grad_norm": 0.2261633276939392,
"learning_rate": 4.896957622514298e-07,
"loss": 1.6093,
"step": 160
},
{
"epoch": 2.900900900900901,
"grad_norm": 0.2446056753396988,
"learning_rate": 3.599326178742535e-07,
"loss": 1.5191,
"step": 161
},
{
"epoch": 2.918918918918919,
"grad_norm": 0.3451971411705017,
"learning_rate": 2.500449848859776e-07,
"loss": 1.9343,
"step": 162
},
{
"epoch": 2.936936936936937,
"grad_norm": 0.46758314967155457,
"learning_rate": 1.6007686147225254e-07,
"loss": 2.3823,
"step": 163
},
{
"epoch": 2.954954954954955,
"grad_norm": 0.15292246639728546,
"learning_rate": 9.006427019622176e-08,
"loss": 1.9701,
"step": 164
},
{
"epoch": 2.972972972972973,
"grad_norm": 0.1602296382188797,
"learning_rate": 4.0035243575342605e-08,
"loss": 0.7246,
"step": 165
},
{
"epoch": 2.990990990990991,
"grad_norm": 0.2040543258190155,
"learning_rate": 1.0009812857370016e-08,
"loss": 1.3962,
"step": 166
},
{
"epoch": 3.009009009009009,
"grad_norm": 0.4154304265975952,
"learning_rate": 0.0,
"loss": 2.7024,
"step": 167
}
],
"logging_steps": 1,
"max_steps": 167,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.268381537599488e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}