{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9430784776018861,
  "eval_steps": 100,
  "global_step": 1400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.0006736274840013472, "grad_norm": 3.42309308052063, "learning_rate": 1.0067114093959731e-07, "loss": 0.4257, "step": 1},
    {"epoch": 0.0013472549680026945, "grad_norm": 3.701201915740967, "learning_rate": 2.0134228187919462e-07, "loss": 0.4285, "step": 2},
    {"epoch": 0.0020208824520040417, "grad_norm": 4.045602321624756, "learning_rate": 3.0201342281879193e-07, "loss": 0.4232, "step": 3},
    {"epoch": 0.002694509936005389, "grad_norm": 3.859919786453247, "learning_rate": 4.0268456375838924e-07, "loss": 0.4029, "step": 4},
    {"epoch": 0.003368137420006736, "grad_norm": 4.171447277069092, "learning_rate": 5.033557046979866e-07, "loss": 0.4158, "step": 5},
    {"epoch": 0.0040417649040080834, "grad_norm": 3.556626796722412, "learning_rate": 6.040268456375839e-07, "loss": 0.3945, "step": 6},
    {"epoch": 0.004715392388009431, "grad_norm": 3.78082537651062, "learning_rate": 7.046979865771813e-07, "loss": 0.372, "step": 7},
    {"epoch": 0.005389019872010778, "grad_norm": 3.459005355834961, "learning_rate": 8.053691275167785e-07, "loss": 0.392, "step": 8},
    {"epoch": 0.006062647356012125, "grad_norm": 3.6338694095611572, "learning_rate": 9.060402684563759e-07, "loss": 0.4221, "step": 9},
    {"epoch": 0.006736274840013472, "grad_norm": 3.6951706409454346, "learning_rate": 1.006711409395973e-06, "loss": 0.4504, "step": 10},
    {"epoch": 0.00740990232401482, "grad_norm": 2.708463668823242, "learning_rate": 1.1073825503355705e-06, "loss": 0.3848, "step": 11},
    {"epoch": 0.008083529808016167, "grad_norm": 2.9015040397644043, "learning_rate": 1.2080536912751677e-06, "loss": 0.3936, "step": 12},
    {"epoch": 0.008757157292017514, "grad_norm": 3.133338212966919, "learning_rate": 1.3087248322147651e-06, "loss": 0.3844, "step": 13},
    {"epoch": 0.009430784776018861, "grad_norm": 2.1770880222320557, "learning_rate": 1.4093959731543626e-06, "loss": 0.3474, "step": 14},
    {"epoch": 0.010104412260020209, "grad_norm": 1.66121244430542, "learning_rate": 1.5100671140939598e-06, "loss": 0.3646, "step": 15},
    {"epoch": 0.010778039744021556, "grad_norm": 1.725306510925293, "learning_rate": 1.610738255033557e-06, "loss": 0.3739, "step": 16},
    {"epoch": 0.011451667228022903, "grad_norm": 1.5393097400665283, "learning_rate": 1.7114093959731544e-06, "loss": 0.2967, "step": 17},
    {"epoch": 0.01212529471202425, "grad_norm": 1.6653029918670654, "learning_rate": 1.8120805369127518e-06, "loss": 0.3491, "step": 18},
    {"epoch": 0.012798922196025598, "grad_norm": 1.4329285621643066, "learning_rate": 1.912751677852349e-06, "loss": 0.3438, "step": 19},
    {"epoch": 0.013472549680026945, "grad_norm": 1.1590880155563354, "learning_rate": 2.013422818791946e-06, "loss": 0.2907, "step": 20},
    {"epoch": 0.014146177164028292, "grad_norm": 1.4018336534500122, "learning_rate": 2.1140939597315434e-06, "loss": 0.3504, "step": 21},
    {"epoch": 0.01481980464802964, "grad_norm": 1.201278805732727, "learning_rate": 2.214765100671141e-06, "loss": 0.3176, "step": 22},
    {"epoch": 0.015493432132030987, "grad_norm": 1.14249849319458, "learning_rate": 2.3154362416107382e-06, "loss": 0.3079, "step": 23},
    {"epoch": 0.016167059616032334, "grad_norm": 1.0337632894515991, "learning_rate": 2.4161073825503354e-06, "loss": 0.3039, "step": 24},
    {"epoch": 0.016840687100033683, "grad_norm": 0.9944117665290833, "learning_rate": 2.516778523489933e-06, "loss": 0.297, "step": 25},
    {"epoch": 0.017514314584035028, "grad_norm": 0.946663498878479, "learning_rate": 2.6174496644295303e-06, "loss": 0.3315, "step": 26},
    {"epoch": 0.018187942068036377, "grad_norm": 1.056069254875183, "learning_rate": 2.7181208053691275e-06, "loss": 0.3274, "step": 27},
    {"epoch": 0.018861569552037723, "grad_norm": 0.9784092903137207, "learning_rate": 2.818791946308725e-06, "loss": 0.3399, "step": 28},
    {"epoch": 0.01953519703603907, "grad_norm": 1.07163667678833, "learning_rate": 2.9194630872483223e-06, "loss": 0.3361, "step": 29},
    {"epoch": 0.020208824520040417, "grad_norm": 0.9870592951774597, "learning_rate": 3.0201342281879195e-06, "loss": 0.3026, "step": 30},
    {"epoch": 0.020882452004041766, "grad_norm": 0.9180539846420288, "learning_rate": 3.120805369127517e-06, "loss": 0.2716, "step": 31},
    {"epoch": 0.02155607948804311, "grad_norm": 0.8827613592147827, "learning_rate": 3.221476510067114e-06, "loss": 0.2623, "step": 32},
    {"epoch": 0.02222970697204446, "grad_norm": 0.8390945196151733, "learning_rate": 3.3221476510067116e-06, "loss": 0.2792, "step": 33},
    {"epoch": 0.022903334456045806, "grad_norm": 0.8577262163162231, "learning_rate": 3.4228187919463088e-06, "loss": 0.2906, "step": 34},
    {"epoch": 0.023576961940047155, "grad_norm": 0.7939577102661133, "learning_rate": 3.523489932885906e-06, "loss": 0.2569, "step": 35},
    {"epoch": 0.0242505894240485, "grad_norm": 0.8591914772987366, "learning_rate": 3.6241610738255036e-06, "loss": 0.3072, "step": 36},
    {"epoch": 0.02492421690804985, "grad_norm": 0.8437011241912842, "learning_rate": 3.724832214765101e-06, "loss": 0.3002, "step": 37},
    {"epoch": 0.025597844392051195, "grad_norm": 0.8370192646980286, "learning_rate": 3.825503355704698e-06, "loss": 0.2693, "step": 38},
    {"epoch": 0.026271471876052544, "grad_norm": 0.7814688086509705, "learning_rate": 3.926174496644295e-06, "loss": 0.2816, "step": 39},
    {"epoch": 0.02694509936005389, "grad_norm": 0.8348170518875122, "learning_rate": 4.026845637583892e-06, "loss": 0.251, "step": 40},
    {"epoch": 0.02761872684405524, "grad_norm": 0.7987892627716064, "learning_rate": 4.12751677852349e-06, "loss": 0.2637, "step": 41},
    {"epoch": 0.028292354328056584, "grad_norm": 0.8840119242668152, "learning_rate": 4.228187919463087e-06, "loss": 0.315, "step": 42},
    {"epoch": 0.028965981812057933, "grad_norm": 0.7633718848228455, "learning_rate": 4.328859060402685e-06, "loss": 0.2988, "step": 43},
    {"epoch": 0.02963960929605928, "grad_norm": 0.7476988434791565, "learning_rate": 4.429530201342282e-06, "loss": 0.2856, "step": 44},
    {"epoch": 0.030313236780060628, "grad_norm": 0.7812101244926453, "learning_rate": 4.530201342281879e-06, "loss": 0.2622, "step": 45},
    {"epoch": 0.030986864264061973, "grad_norm": 0.7842202186584473, "learning_rate": 4.6308724832214765e-06, "loss": 0.3366, "step": 46},
    {"epoch": 0.03166049174806332, "grad_norm": 0.7462322115898132, "learning_rate": 4.731543624161074e-06, "loss": 0.2825, "step": 47},
    {"epoch": 0.03233411923206467, "grad_norm": 0.7542632818222046, "learning_rate": 4.832214765100671e-06, "loss": 0.2549, "step": 48},
    {"epoch": 0.033007746716066017, "grad_norm": 0.7200729250907898, "learning_rate": 4.932885906040269e-06, "loss": 0.2593, "step": 49},
    {"epoch": 0.033681374200067365, "grad_norm": 0.8686407208442688, "learning_rate": 5.033557046979866e-06, "loss": 0.2828, "step": 50},
    {"epoch": 0.03435500168406871, "grad_norm": 0.734254002571106, "learning_rate": 5.134228187919463e-06, "loss": 0.2841, "step": 51},
    {"epoch": 0.035028629168070056, "grad_norm": 0.7483602166175842, "learning_rate": 5.2348993288590606e-06, "loss": 0.2956, "step": 52},
    {"epoch": 0.035702256652071405, "grad_norm": 0.7722125053405762, "learning_rate": 5.335570469798658e-06, "loss": 0.2817, "step": 53},
    {"epoch": 0.036375884136072754, "grad_norm": 0.7247833609580994, "learning_rate": 5.436241610738255e-06, "loss": 0.2589, "step": 54},
    {"epoch": 0.037049511620074096, "grad_norm": 0.8258161544799805, "learning_rate": 5.536912751677853e-06, "loss": 0.2929, "step": 55},
    {"epoch": 0.037723139104075445, "grad_norm": 0.784130334854126, "learning_rate": 5.63758389261745e-06, "loss": 0.2639, "step": 56},
    {"epoch": 0.038396766588076794, "grad_norm": 0.8519976735115051, "learning_rate": 5.738255033557047e-06, "loss": 0.2611, "step": 57},
    {"epoch": 0.03907039407207814, "grad_norm": 0.7617088556289673, "learning_rate": 5.838926174496645e-06, "loss": 0.3038, "step": 58},
    {"epoch": 0.039744021556079485, "grad_norm": 0.7174592018127441, "learning_rate": 5.939597315436242e-06, "loss": 0.2451, "step": 59},
    {"epoch": 0.040417649040080834, "grad_norm": 0.7933776378631592, "learning_rate": 6.040268456375839e-06, "loss": 0.2979, "step": 60},
    {"epoch": 0.04109127652408218, "grad_norm": 0.7308351993560791, "learning_rate": 6.140939597315437e-06, "loss": 0.2547, "step": 61},
    {"epoch": 0.04176490400808353, "grad_norm": 0.8782221674919128, "learning_rate": 6.241610738255034e-06, "loss": 0.2948, "step": 62},
    {"epoch": 0.042438531492084874, "grad_norm": 0.7220450043678284, "learning_rate": 6.342281879194631e-06, "loss": 0.2397, "step": 63},
    {"epoch": 0.04311215897608622, "grad_norm": 0.8042862415313721, "learning_rate": 6.442953020134228e-06, "loss": 0.2963, "step": 64},
    {"epoch": 0.04378578646008757, "grad_norm": 0.6996918320655823, "learning_rate": 6.543624161073825e-06, "loss": 0.2542, "step": 65},
    {"epoch": 0.04445941394408892, "grad_norm": 0.7606627941131592, "learning_rate": 6.644295302013423e-06, "loss": 0.285, "step": 66},
    {"epoch": 0.04513304142809026, "grad_norm": 0.8591688275337219, "learning_rate": 6.74496644295302e-06, "loss": 0.2671, "step": 67},
    {"epoch": 0.04580666891209161, "grad_norm": 0.8488709330558777, "learning_rate": 6.8456375838926175e-06, "loss": 0.2751, "step": 68},
    {"epoch": 0.04648029639609296, "grad_norm": 0.7567676305770874, "learning_rate": 6.946308724832215e-06, "loss": 0.301, "step": 69},
    {"epoch": 0.04715392388009431, "grad_norm": 0.7121560573577881, "learning_rate": 7.046979865771812e-06, "loss": 0.2557, "step": 70},
    {"epoch": 0.04782755136409565, "grad_norm": 0.7666682600975037, "learning_rate": 7.147651006711409e-06, "loss": 0.2582, "step": 71},
    {"epoch": 0.048501178848097, "grad_norm": 0.7414038181304932, "learning_rate": 7.248322147651007e-06, "loss": 0.262, "step": 72},
    {"epoch": 0.04917480633209835, "grad_norm": 0.8357811570167542, "learning_rate": 7.348993288590604e-06, "loss": 0.2591, "step": 73},
    {"epoch": 0.0498484338160997, "grad_norm": 0.7933549880981445, "learning_rate": 7.449664429530202e-06, "loss": 0.282, "step": 74},
    {"epoch": 0.05052206130010104, "grad_norm": 0.7420201301574707, "learning_rate": 7.5503355704698e-06, "loss": 0.2469, "step": 75},
    {"epoch": 0.05119568878410239, "grad_norm": 0.7670828104019165, "learning_rate": 7.651006711409396e-06, "loss": 0.295, "step": 76},
    {"epoch": 0.05186931626810374, "grad_norm": 0.722752571105957, "learning_rate": 7.751677852348993e-06, "loss": 0.2301, "step": 77},
    {"epoch": 0.05254294375210509, "grad_norm": 0.7430191040039062, "learning_rate": 7.85234899328859e-06, "loss": 0.2875, "step": 78},
    {"epoch": 0.05321657123610643, "grad_norm": 0.6979767084121704, "learning_rate": 7.953020134228188e-06, "loss": 0.2326, "step": 79},
    {"epoch": 0.05389019872010778, "grad_norm": 0.7197319269180298, "learning_rate": 8.053691275167785e-06, "loss": 0.2413, "step": 80},
    {"epoch": 0.05456382620410913, "grad_norm": 0.7689131498336792, "learning_rate": 8.154362416107382e-06, "loss": 0.2868, "step": 81},
    {"epoch": 0.05523745368811048, "grad_norm": 0.7233304381370544, "learning_rate": 8.25503355704698e-06, "loss": 0.2586, "step": 82},
    {"epoch": 0.05591108117211182, "grad_norm": 0.8464373350143433, "learning_rate": 8.355704697986576e-06, "loss": 0.2998, "step": 83},
    {"epoch": 0.05658470865611317, "grad_norm": 0.8020244240760803, "learning_rate": 8.456375838926174e-06, "loss": 0.3323, "step": 84},
    {"epoch": 0.05725833614011452, "grad_norm": 0.9260913729667664, "learning_rate": 8.55704697986577e-06, "loss": 0.3353, "step": 85},
    {"epoch": 0.057931963624115866, "grad_norm": 0.824252188205719, "learning_rate": 8.65771812080537e-06, "loss": 0.2778, "step": 86},
    {"epoch": 0.05860559110811721, "grad_norm": 0.7277565598487854, "learning_rate": 8.758389261744967e-06, "loss": 0.2863, "step": 87},
    {"epoch": 0.05927921859211856, "grad_norm": 0.7575395107269287, "learning_rate": 8.859060402684564e-06, "loss": 0.2192, "step": 88},
    {"epoch": 0.059952846076119906, "grad_norm": 0.7741091251373291, "learning_rate": 8.959731543624161e-06, "loss": 0.2808, "step": 89},
    {"epoch": 0.060626473560121255, "grad_norm": 0.7291881442070007, "learning_rate": 9.060402684563759e-06, "loss": 0.2624, "step": 90},
    {"epoch": 0.0613001010441226, "grad_norm": 0.7662385106086731, "learning_rate": 9.161073825503356e-06, "loss": 0.2803, "step": 91},
    {"epoch": 0.061973728528123946, "grad_norm": 0.7009522914886475, "learning_rate": 9.261744966442953e-06, "loss": 0.26, "step": 92},
    {"epoch": 0.06264735601212529, "grad_norm": 0.8707520365715027, "learning_rate": 9.36241610738255e-06, "loss": 0.3179, "step": 93},
    {"epoch": 0.06332098349612664, "grad_norm": 0.8629103302955627, "learning_rate": 9.463087248322147e-06, "loss": 0.3065, "step": 94},
    {"epoch": 0.06399461098012799, "grad_norm": 0.8592970371246338, "learning_rate": 9.563758389261745e-06, "loss": 0.2574, "step": 95},
    {"epoch": 0.06466823846412934, "grad_norm": 0.8038861751556396, "learning_rate": 9.664429530201342e-06, "loss": 0.2699, "step": 96},
    {"epoch": 0.06534186594813068, "grad_norm": 0.7168505787849426, "learning_rate": 9.765100671140939e-06, "loss": 0.2507, "step": 97},
    {"epoch": 0.06601549343213203, "grad_norm": 0.7545929551124573, "learning_rate": 9.865771812080538e-06, "loss": 0.2922, "step": 98},
    {"epoch": 0.06668912091613338, "grad_norm": 0.7718814611434937, "learning_rate": 9.966442953020135e-06, "loss": 0.254, "step": 99},
    {"epoch": 0.06736274840013473, "grad_norm": 0.8245450854301453, "learning_rate": 1.0067114093959732e-05, "loss": 0.2869, "step": 100},
    {"epoch": 0.06736274840013473, "eval_loss": 0.2735002040863037, "eval_runtime": 104.2064, "eval_samples_per_second": 47.982, "eval_steps_per_second": 3.004, "step": 100},
    {"epoch": 0.06803637588413607, "grad_norm": 0.9367719888687134, "learning_rate": 1.016778523489933e-05, "loss": 0.3273, "step": 101},
    {"epoch": 0.06871000336813742, "grad_norm": 0.7697410583496094, "learning_rate": 1.0268456375838927e-05, "loss": 0.2493, "step": 102},
    {"epoch": 0.06938363085213876, "grad_norm": 0.7449803948402405, "learning_rate": 1.0369127516778524e-05, "loss": 0.2558, "step": 103},
    {"epoch": 0.07005725833614011, "grad_norm": 0.7809726595878601, "learning_rate": 1.0469798657718121e-05, "loss": 0.3079, "step": 104},
    {"epoch": 0.07073088582014146, "grad_norm": 0.8014216423034668, "learning_rate": 1.0570469798657718e-05, "loss": 0.2774, "step": 105},
    {"epoch": 0.07140451330414281, "grad_norm": 0.7782856225967407, "learning_rate": 1.0671140939597316e-05, "loss": 0.2874, "step": 106},
    {"epoch": 0.07207814078814416, "grad_norm": 0.7489345669746399, "learning_rate": 1.0771812080536913e-05, "loss": 0.2618, "step": 107},
    {"epoch": 0.07275176827214551, "grad_norm": 0.7881894111633301, "learning_rate": 1.087248322147651e-05, "loss": 0.2914, "step": 108},
    {"epoch": 0.07342539575614684, "grad_norm": 0.8149965405464172, "learning_rate": 1.0973154362416109e-05, "loss": 0.2904, "step": 109},
    {"epoch": 0.07409902324014819, "grad_norm": 0.8088157176971436, "learning_rate": 1.1073825503355706e-05, "loss": 0.3084, "step": 110},
    {"epoch": 0.07477265072414954, "grad_norm": 0.804861843585968, "learning_rate": 1.1174496644295303e-05, "loss": 0.2919, "step": 111},
    {"epoch": 0.07544627820815089, "grad_norm": 0.7035599946975708, "learning_rate": 1.12751677852349e-05, "loss": 0.2971, "step": 112},
    {"epoch": 0.07611990569215224, "grad_norm": 0.8036991357803345, "learning_rate": 1.1375838926174498e-05, "loss": 0.2857, "step": 113},
    {"epoch": 0.07679353317615359, "grad_norm": 0.6793683767318726, "learning_rate": 1.1476510067114095e-05, "loss": 0.2742, "step": 114},
    {"epoch": 0.07746716066015494, "grad_norm": 0.7865248918533325, "learning_rate": 1.1577181208053692e-05, "loss": 0.3156, "step": 115},
    {"epoch": 0.07814078814415629, "grad_norm": 0.6990460157394409, "learning_rate": 1.167785234899329e-05, "loss": 0.281, "step": 116},
    {"epoch": 0.07881441562815762, "grad_norm": 0.7218809723854065, "learning_rate": 1.1778523489932886e-05, "loss": 0.2584, "step": 117},
    {"epoch": 0.07948804311215897, "grad_norm": 0.6970985531806946, "learning_rate": 1.1879194630872484e-05, "loss": 0.2438, "step": 118},
    {"epoch": 0.08016167059616032, "grad_norm": 0.7687243819236755, "learning_rate": 1.1979865771812081e-05, "loss": 0.2846, "step": 119},
    {"epoch": 0.08083529808016167, "grad_norm": 0.6929764151573181, "learning_rate": 1.2080536912751678e-05, "loss": 0.2611, "step": 120},
    {"epoch": 0.08150892556416302, "grad_norm": 0.729848325252533, "learning_rate": 1.2181208053691277e-05, "loss": 0.3007, "step": 121},
    {"epoch": 0.08218255304816437, "grad_norm": 0.7301985025405884, "learning_rate": 1.2281879194630874e-05, "loss": 0.2847, "step": 122},
    {"epoch": 0.08285618053216572, "grad_norm": 0.7333296537399292, "learning_rate": 1.2382550335570471e-05, "loss": 0.2674, "step": 123},
    {"epoch": 0.08352980801616706, "grad_norm": 0.7411990165710449, "learning_rate": 1.2483221476510069e-05, "loss": 0.2777, "step": 124},
    {"epoch": 0.08420343550016841, "grad_norm": 0.6465498805046082, "learning_rate": 1.2583892617449664e-05, "loss": 0.2579, "step": 125},
    {"epoch": 0.08487706298416975, "grad_norm": 0.6950599551200867, "learning_rate": 1.2684563758389261e-05, "loss": 0.3164, "step": 126},
    {"epoch": 0.0855506904681711, "grad_norm": 0.6696597337722778, "learning_rate": 1.2785234899328858e-05, "loss": 0.2564, "step": 127},
    {"epoch": 0.08622431795217245, "grad_norm": 0.6537868976593018, "learning_rate": 1.2885906040268456e-05, "loss": 0.2375, "step": 128},
    {"epoch": 0.0868979454361738, "grad_norm": 0.7363224029541016, "learning_rate": 1.2986577181208053e-05, "loss": 0.2589, "step": 129},
    {"epoch": 0.08757157292017514, "grad_norm": 0.7354284524917603, "learning_rate": 1.308724832214765e-05, "loss": 0.3049, "step": 130},
    {"epoch": 0.0882452004041765, "grad_norm": 0.6521575450897217, "learning_rate": 1.3187919463087247e-05, "loss": 0.2385, "step": 131},
    {"epoch": 0.08891882788817784, "grad_norm": 0.6530443429946899, "learning_rate": 1.3288590604026846e-05, "loss": 0.2588, "step": 132},
    {"epoch": 0.08959245537217919, "grad_norm": 0.7331404089927673, "learning_rate": 1.3389261744966443e-05, "loss": 0.3061, "step": 133},
    {"epoch": 0.09026608285618053, "grad_norm": 0.7427138090133667, "learning_rate": 1.348993288590604e-05, "loss": 0.3513, "step": 134},
    {"epoch": 0.09093971034018188, "grad_norm": 0.6774203181266785, "learning_rate": 1.3590604026845638e-05, "loss": 0.2639, "step": 135},
    {"epoch": 0.09161333782418322, "grad_norm": 0.6679060459136963, "learning_rate": 1.3691275167785235e-05, "loss": 0.2503, "step": 136},
    {"epoch": 0.09228696530818457, "grad_norm": 0.6390411853790283, "learning_rate": 1.3791946308724832e-05, "loss": 0.2298, "step": 137},
    {"epoch": 0.09296059279218592, "grad_norm": 0.7115532159805298, "learning_rate": 1.389261744966443e-05, "loss": 0.255, "step": 138},
    {"epoch": 0.09363422027618727, "grad_norm": 0.6546367406845093, "learning_rate": 1.3993288590604027e-05, "loss": 0.2623, "step": 139},
    {"epoch": 0.09430784776018862, "grad_norm": 0.7526003122329712, "learning_rate": 1.4093959731543624e-05, "loss": 0.2701, "step": 140},
    {"epoch": 0.09498147524418997, "grad_norm": 0.7417687773704529, "learning_rate": 1.4194630872483221e-05, "loss": 0.2488, "step": 141},
    {"epoch": 0.0956551027281913, "grad_norm": 0.6994727849960327, "learning_rate": 1.4295302013422818e-05, "loss": 0.2861, "step": 142},
    {"epoch": 0.09632873021219265, "grad_norm": 0.7503766417503357, "learning_rate": 1.4395973154362415e-05, "loss": 0.3002, "step": 143},
    {"epoch": 0.097002357696194, "grad_norm": 0.6777353882789612, "learning_rate": 1.4496644295302014e-05, "loss": 0.2548, "step": 144},
    {"epoch": 0.09767598518019535, "grad_norm": 0.8131176829338074, "learning_rate": 1.4597315436241612e-05, "loss": 0.2736, "step": 145},
    {"epoch": 0.0983496126641967, "grad_norm": 0.6841787099838257, "learning_rate": 1.4697986577181209e-05, "loss": 0.2647, "step": 146},
    {"epoch": 0.09902324014819805, "grad_norm": 0.673572838306427, "learning_rate": 1.4798657718120806e-05, "loss": 0.2414, "step": 147},
    {"epoch": 0.0996968676321994, "grad_norm": 0.6950225234031677, "learning_rate": 1.4899328859060403e-05, "loss": 0.268, "step": 148},
    {"epoch": 0.10037049511620075, "grad_norm": 0.7058023810386658, "learning_rate": 1.5e-05, "loss": 0.2682, "step": 149},
    {"epoch": 0.10104412260020208, "grad_norm": 0.7642398476600647, "learning_rate": 1.4999979233262118e-05, "loss": 0.2871, "step": 150},
    {"epoch": 0.10171775008420343, "grad_norm": 0.7045179605484009, "learning_rate": 1.4999916933163468e-05, "loss": 0.2589, "step": 151},
    {"epoch": 0.10239137756820478, "grad_norm": 0.6908326148986816, "learning_rate": 1.499981310004906e-05, "loss": 0.2727, "step": 152},
    {"epoch": 0.10306500505220613, "grad_norm": 0.7265616655349731, "learning_rate": 1.4999667734493901e-05, "loss": 0.3177, "step": 153},
    {"epoch": 0.10373863253620748, "grad_norm": 0.630407452583313, "learning_rate": 1.4999480837302995e-05, "loss": 0.2636, "step": 154},
    {"epoch": 0.10441226002020883, "grad_norm": 0.6864127516746521, "learning_rate": 1.4999252409511335e-05, "loss": 0.3013, "step": 155},
    {"epoch": 0.10508588750421018, "grad_norm": 0.7556886076927185, "learning_rate": 1.4998982452383916e-05, "loss": 0.279, "step": 156},
    {"epoch": 0.10575951498821153, "grad_norm": 0.7267988324165344, "learning_rate": 1.4998670967415701e-05, "loss": 0.2528, "step": 157},
    {"epoch": 0.10643314247221286, "grad_norm": 0.6894652843475342, "learning_rate": 1.4998317956331634e-05, "loss": 0.2833, "step": 158},
    {"epoch": 0.10710676995621421, "grad_norm": 0.7065450549125671, "learning_rate": 1.4997923421086613e-05, "loss": 0.3159, "step": 159},
    {"epoch": 0.10778039744021556, "grad_norm": 0.6692951321601868, "learning_rate": 1.49974873638655e-05, "loss": 0.2747, "step": 160},
    {"epoch": 0.10845402492421691, "grad_norm": 0.589299738407135, "learning_rate": 1.4997009787083088e-05, "loss": 0.2436, "step": 161},
    {"epoch": 0.10912765240821826, "grad_norm": 0.6986613869667053, "learning_rate": 1.49964906933841e-05, "loss": 0.2893, "step": 162},
    {"epoch": 0.1098012798922196, "grad_norm": 0.6756588220596313, "learning_rate": 1.4995930085643173e-05, "loss": 0.3076, "step": 163},
    {"epoch": 0.11047490737622095, "grad_norm": 0.6988603472709656, "learning_rate": 1.4995327966964838e-05, "loss": 0.2646, "step": 164},
    {"epoch": 0.1111485348602223, "grad_norm": 0.6961201429367065, "learning_rate": 1.4994684340683506e-05, "loss": 0.2984, "step": 165},
    {"epoch": 0.11182216234422364, "grad_norm": 0.7064459323883057, "learning_rate": 1.4993999210363444e-05, "loss": 0.3186, "step": 166},
    {"epoch": 0.11249578982822499, "grad_norm": 0.6374897360801697, "learning_rate": 1.4993272579798773e-05, "loss": 0.2833, "step": 167},
    {"epoch": 0.11316941731222634, "grad_norm": 0.6672942638397217, "learning_rate": 1.4992504453013422e-05, "loss": 0.2891, "step": 168},
    {"epoch": 0.11384304479622769, "grad_norm": 0.6631248593330383, "learning_rate": 1.499169483426112e-05, "loss": 0.2512, "step": 169},
    {"epoch": 0.11451667228022903, "grad_norm": 0.7132297158241272, "learning_rate": 1.4990843728025367e-05, "loss": 0.2988, "step": 170},
    {"epoch": 0.11519029976423038, "grad_norm": 0.6612878441810608, "learning_rate": 1.4989951139019425e-05, "loss": 0.283, "step": 171},
    {"epoch": 0.11586392724823173, "grad_norm": 0.6382921934127808, "learning_rate": 1.4989017072186267e-05, "loss": 0.2597, "step": 172},
    {"epoch": 0.11653755473223308, "grad_norm": 0.5888513922691345, "learning_rate": 1.498804153269856e-05, "loss": 0.243, "step": 173},
    {"epoch": 0.11721118221623442, "grad_norm": 0.7310932874679565, "learning_rate": 1.498702452595865e-05, "loss": 0.2871, "step": 174},
    {"epoch": 0.11788480970023577, "grad_norm": 0.6769680380821228, "learning_rate": 1.4985966057598512e-05, "loss": 0.2896, "step": 175},
    {"epoch": 0.11855843718423711, "grad_norm": 0.7013587355613708, "learning_rate": 1.4984866133479729e-05, "loss": 0.2913, "step": 176},
    {"epoch": 0.11923206466823846, "grad_norm": 0.7067077159881592, "learning_rate": 1.4983724759693456e-05, "loss": 0.2931, "step": 177},
    {"epoch": 0.11990569215223981, "grad_norm": 0.6384806632995605, "learning_rate": 1.498254194256039e-05, "loss": 0.2433, "step": 178},
    {"epoch": 0.12057931963624116, "grad_norm": 0.733525276184082, "learning_rate": 1.4981317688630729e-05, "loss": 0.314, "step": 179},
    {"epoch": 0.12125294712024251, "grad_norm": 0.6598628759384155, "learning_rate": 1.4980052004684146e-05, "loss": 0.281, "step": 180},
    {"epoch": 0.12192657460424386, "grad_norm": 0.616263210773468, "learning_rate": 1.4978744897729741e-05, "loss": 0.2616, "step": 181},
    {"epoch": 0.1226002020882452, "grad_norm": 0.6175768971443176, "learning_rate": 1.4977396375006006e-05, "loss": 0.2624, "step": 182},
    {"epoch": 0.12327382957224654, "grad_norm": 0.676030695438385, "learning_rate": 1.4976006443980785e-05, "loss": 0.287, "step": 183},
    {"epoch": 0.12394745705624789, "grad_norm": 0.6331183314323425, "learning_rate": 1.4974575112351235e-05, "loss": 0.2647, "step": 184},
    {"epoch": 0.12462108454024924, "grad_norm": 0.656204104423523, "learning_rate": 1.497310238804378e-05, "loss": 0.2755, "step": 185},
    {"epoch": 0.12529471202425058, "grad_norm": 0.6582143306732178, "learning_rate": 1.4971588279214065e-05, "loss": 0.2774, "step": 186},
    {"epoch": 0.12596833950825193, "grad_norm": 0.6152216792106628, "learning_rate": 1.4970032794246918e-05, "loss": 0.2694, "step": 187},
    {"epoch": 0.12664196699225327, "grad_norm": 0.5943458676338196, "learning_rate": 1.4968435941756303e-05, "loss": 0.2698, "step": 188},
    {"epoch": 0.12731559447625462, "grad_norm": 0.7527596354484558, "learning_rate": 1.496679773058526e-05, "loss": 0.2996, "step": 189},
    {"epoch": 0.12798922196025597, "grad_norm": 0.6229069828987122, "learning_rate": 1.4965118169805868e-05, "loss": 0.275, "step": 190},
    {"epoch": 0.12866284944425732, "grad_norm": 0.620919406414032, "learning_rate": 1.4963397268719198e-05, "loss": 0.2956, "step": 191},
    {"epoch": 0.12933647692825867, "grad_norm": 0.6090366244316101, "learning_rate": 1.4961635036855249e-05, "loss": 0.258, "step": 192},
    {"epoch": 0.13001010441226002, "grad_norm": 0.5942346453666687, "learning_rate": 1.4959831483972901e-05, "loss": 0.266, "step": 193},
    {"epoch": 0.13068373189626137, "grad_norm": 0.6019350290298462, "learning_rate": 1.4957986620059866e-05, "loss": 0.256, "step": 194},
    {"epoch": 0.13135735938026272, "grad_norm": 0.6708882451057434, "learning_rate": 1.4956100455332623e-05, "loss": 0.2924, "step": 195},
    {"epoch": 0.13203098686426407, "grad_norm": 0.7132793068885803, "learning_rate": 1.4954173000236369e-05, "loss": 0.3174, "step": 196},
    {"epoch": 0.13270461434826542, "grad_norm": 0.602311909198761, "learning_rate": 1.495220426544496e-05, "loss": 0.2388, "step": 197},
    {"epoch": 0.13337824183226676, "grad_norm": 0.5862560868263245, "learning_rate": 1.495019426186085e-05, "loss": 0.2382, "step": 198},
    {"epoch": 0.1340518693162681, "grad_norm": 0.6618714332580566, "learning_rate": 1.4948143000615028e-05, "loss": 0.2654, "step": 199},
    {"epoch": 0.13472549680026946, "grad_norm": 0.6195774078369141, "learning_rate": 1.4946050493066965e-05, "loss": 0.2696, "step": 200},
    {"epoch": 0.13472549680026946, "eval_loss": 0.2768155038356781, "eval_runtime": 105.0569, "eval_samples_per_second": 47.593, "eval_steps_per_second": 2.979, "step": 200},
    {"epoch": 0.1353991242842708, "grad_norm": 0.5954621434211731, "learning_rate": 1.4943916750804537e-05, "loss": 0.2625, "step": 201},
    {"epoch": 0.13607275176827213, "grad_norm": 0.610717236995697, "learning_rate": 1.494174178564398e-05, "loss": 0.2953, "step": 202},
    {"epoch": 0.13674637925227348, "grad_norm": 0.6930943727493286, "learning_rate": 1.4939525609629809e-05, "loss": 0.2774, "step": 203},
    {"epoch": 0.13742000673627483, "grad_norm": 0.6402983069419861, "learning_rate": 1.4937268235034754e-05, "loss": 0.2814, "step": 204},
    {"epoch": 0.13809363422027618, "grad_norm": 0.6476616859436035, "learning_rate": 1.4934969674359698e-05, "loss": 0.2829, "step": 205},
    {"epoch": 0.13876726170427753, "grad_norm": 0.6163775324821472, "learning_rate": 1.49326299403336e-05, "loss": 0.2682, "step": 206},
    {"epoch": 0.13944088918827888, "grad_norm": 0.6615155935287476, "learning_rate": 1.4930249045913437e-05, "loss": 0.2656, "step": 207},
    {"epoch": 0.14011451667228023, "grad_norm": 0.6666435599327087, "learning_rate": 1.4927827004284117e-05, "loss": 0.2972, "step": 208},
    {"epoch": 0.14078814415628157, "grad_norm": 0.6047382950782776, "learning_rate": 1.4925363828858407e-05, "loss": 0.2527, "step": 209},
    {"epoch": 0.14146177164028292, "grad_norm": 0.6405648589134216, "learning_rate": 1.4922859533276882e-05, "loss": 0.2589, "step": 210},
    {"epoch": 0.14213539912428427, "grad_norm": 0.6201145648956299, "learning_rate": 1.4920314131407817e-05, "loss": 0.2419, "step": 211},
    {"epoch": 0.14280902660828562, "grad_norm": 0.6683364510536194, "learning_rate": 1.4917727637347132e-05, "loss": 0.2973, "step": 212},
    {"epoch": 0.14348265409228697, "grad_norm": 0.5999878644943237, "learning_rate": 1.4915100065418302e-05, "loss": 0.2714, "step": 213},
    {"epoch": 0.14415628157628832, "grad_norm": 0.6046174764633179, "learning_rate": 1.491243143017229e-05, "loss": 0.2841, "step": 214},
    {"epoch": 0.14482990906028967, "grad_norm": 0.6034740209579468, "learning_rate": 1.4909721746387454e-05, "loss": 0.2896, "step": 215},
    {"epoch": 0.14550353654429102, "grad_norm": 0.6835145354270935, "learning_rate": 1.4906971029069473e-05, "loss": 0.2778, "step": 216},
    {"epoch": 0.14617716402829237, "grad_norm": 0.6769616603851318, "learning_rate": 1.490417929345126e-05, "loss": 0.2697, "step": 217},
    {"epoch": 0.1468507915122937, "grad_norm": 0.6558434367179871, "learning_rate": 1.4901346554992879e-05, "loss": 0.2708, "step": 218},
    {"epoch": 0.14752441899629504, "grad_norm": 0.6363021731376648, "learning_rate": 1.489847282938146e-05, "loss": 0.297, "step": 219},
    {"epoch": 0.14819804648029639, "grad_norm": 0.6437724828720093, "learning_rate": 1.4895558132531112e-05, "loss": 0.2827, "step": 220},
    {"epoch": 0.14887167396429773, "grad_norm": 0.6295124292373657, "learning_rate": 1.4892602480582836e-05, "loss": 0.2998, "step": 221},
    {"epoch": 0.14954530144829908, "grad_norm": 0.634768545627594, "learning_rate": 1.4889605889904426e-05, "loss": 0.2686, "step": 222},
    {"epoch": 0.15021892893230043, "grad_norm": 0.624239981174469, "learning_rate": 1.4886568377090396e-05, "loss": 0.3161, "step": 223},
    {"epoch": 0.15089255641630178, "grad_norm": 0.6285136342048645, "learning_rate": 1.4883489958961875e-05, "loss": 0.3089, "step": 224},
    {"epoch": 0.15156618390030313, "grad_norm": 0.6140178442001343, "learning_rate": 1.4880370652566516e-05, "loss": 0.2888, "step": 225},
    {"epoch": 0.15223981138430448, "grad_norm": 0.5987722873687744, "learning_rate": 1.4877210475178403e-05, "loss": 0.2586, "step": 226},
    {"epoch": 0.15291343886830583, "grad_norm": 0.6315680146217346, "learning_rate": 1.487400944429796e-05, "loss": 0.2876, "step": 227},
    {"epoch": 0.15358706635230718, "grad_norm": 0.6932382583618164, "learning_rate": 1.487076757765184e-05, "loss": 0.2886, "step": 228},
    {"epoch": 0.15426069383630853, "grad_norm": 0.5736963748931885, "learning_rate": 1.4867484893192847e-05, "loss": 0.2524, "step": 229},
    {"epoch": 0.15493432132030988, "grad_norm": 0.6102257370948792, "learning_rate": 1.4864161409099814e-05, "loss": 0.2518, "step": 230},
    {"epoch": 0.15560794880431122, "grad_norm": 0.5340930819511414, "learning_rate": 1.4860797143777526e-05, "loss": 0.2466, "step": 231},
    {"epoch": 0.15628157628831257, "grad_norm": 0.6170995831489563, "learning_rate": 1.4857392115856597e-05, "loss": 0.2588, "step": 232},
    {"epoch": 0.15695520377231392, "grad_norm": 0.5439332127571106, "learning_rate": 1.4853946344193386e-05, "loss": 0.2377, "step": 233},
    {"epoch": 0.15762883125631524, "grad_norm": 0.6084430813789368, "learning_rate": 1.4850459847869866e-05, "loss": 0.2514, "step": 234},
    {"epoch": 0.1583024587403166, "grad_norm": 0.6239585280418396, "learning_rate": 1.4846932646193554e-05, "loss": 0.2892, "step": 235},
    {"epoch": 0.15897608622431794, "grad_norm": 0.6361899375915527, "learning_rate": 1.4843364758697371e-05, "loss": 0.264, "step": 236},
    {"epoch": 0.1596497137083193, "grad_norm": 0.5994705557823181, "learning_rate": 1.4839756205139555e-05, "loss": 0.2756, "step": 237},
    {"epoch": 0.16032334119232064, "grad_norm": 0.6532281041145325, "learning_rate": 1.4836107005503543e-05, "loss": 0.3262, "step": 238},
    {"epoch": 0.160996968676322, "grad_norm": 0.6311124563217163, "learning_rate": 1.483241717999786e-05, "loss": 0.3137, "step": 239},
    {"epoch": 0.16167059616032334, "grad_norm": 0.5731788873672485, "learning_rate": 1.4828686749056007e-05, "loss": 0.2476, "step": 240},
    {"epoch": 0.1623442236443247, "grad_norm": 0.5689460039138794, "learning_rate": 1.4824915733336355e-05, "loss": 0.2717, "step": 241},
    {"epoch": 0.16301785112832604, "grad_norm": 0.6340669989585876, "learning_rate": 1.4821104153722023e-05, "loss": 0.2756, "step": 242},
    {"epoch": 0.16369147861232738, "grad_norm": 0.6497682929039001, "learning_rate": 1.4817252031320766e-05, "loss": 0.3197, "step": 243},
    {"epoch": 0.16436510609632873, "grad_norm": 0.6404630541801453, "learning_rate": 1.481335938746485e-05, "loss": 0.2641, "step": 244},
    {"epoch": 0.16503873358033008, "grad_norm": 0.5862687230110168, "learning_rate": 1.480942624371095e-05, "loss": 0.261, "step": 245},
    {"epoch": 0.16571236106433143, "grad_norm": 0.6154356598854065, "learning_rate": 1.4805452621840015e-05, "loss": 0.2856, "step": 246},
    {"epoch": 0.16638598854833278, "grad_norm": 0.7411592602729797, "learning_rate": 1.4801438543857154e-05, "loss": 0.2838, "step": 247},
    {"epoch": 0.16705961603233413, "grad_norm": 0.6304882764816284, "learning_rate": 1.479738403199152e-05, "loss": 0.3102, "step": 248},
    {"epoch": 0.16773324351633548, "grad_norm": 0.5838252305984497, "learning_rate": 1.479328910869617e-05, "loss": 0.3074, "step": 249},
    {"epoch": 0.16840687100033683, "grad_norm": 0.6592857241630554, "learning_rate": 1.4789153796647957e-05, "loss": 0.2482, "step": 250},
    {"epoch": 0.16908049848433815, "grad_norm": 0.6678220629692078, "learning_rate": 1.4784978118747404e-05, "loss": 0.2858, "step": 251},
    {"epoch": 0.1697541259683395, "grad_norm": 0.7072235345840454, "learning_rate": 1.4780762098118564e-05, "loss": 0.317, "step": 252},
    {"epoch": 0.17042775345234085, "grad_norm": 0.6481045484542847, "learning_rate": 1.4776505758108901e-05, "loss": 0.3074, "step": 253},
    {"epoch": 0.1711013809363422, "grad_norm": 0.573128342628479, "learning_rate": 1.477220912228916e-05, "loss": 0.2421, "step": 254},
    {"epoch": 0.17177500842034354, "grad_norm": 0.5758487582206726, "learning_rate": 1.4767872214453241e-05, "loss": 0.2874, "step": 255},
    {"epoch": 0.1724486359043449, "grad_norm": 0.5688092112541199, "learning_rate": 1.4763495058618056e-05, "loss": 0.2897, "step": 256},
    {"epoch": 0.17312226338834624, "grad_norm": 0.607288658618927, "learning_rate": 1.4759077679023406e-05, "loss": 0.2707, "step": 257},
    {"epoch": 0.1737958908723476, "grad_norm": 0.6363064646720886, "learning_rate": 1.4754620100131838e-05, "loss": 0.2977, "step": 258},
    {"epoch": 0.17446951835634894, "grad_norm": 0.6312716007232666, "learning_rate": 1.475012234662852e-05, "loss": 0.2794, "step": 259},
    {"epoch": 0.1751431458403503, "grad_norm": 0.6589624285697937, "learning_rate": 1.4745584443421097e-05, "loss": 0.3483, "step": 260},
    {"epoch": 0.17581677332435164, "grad_norm": 0.5797691345214844, "learning_rate": 1.4741006415639555e-05, "loss": 0.3013, "step": 261},
    {"epoch": 0.176490400808353, "grad_norm": 0.5717487335205078, "learning_rate": 1.473638828863608e-05, "loss": 0.2725, "step": 262},
    {"epoch": 0.17716402829235434, "grad_norm": 0.6161592602729797, "learning_rate": 1.4731730087984924e-05, "loss": 0.3049, "step": 263},
    {"epoch": 0.17783765577635569, "grad_norm": 0.6334370970726013, "learning_rate": 1.4727031839482251e-05, "loss": 0.2844, "step": 264},
    {"epoch": 0.17851128326035703, "grad_norm": 0.576859176158905, "learning_rate": 1.472229356914601e-05, "loss": 0.244, "step": 265},
    {"epoch": 0.17918491074435838, "grad_norm": 0.6241918802261353, "learning_rate": 1.4717515303215776e-05, "loss": 0.2838, "step": 266},
    {"epoch": 0.1798585382283597, "grad_norm": 0.5989061594009399, "learning_rate": 1.4712697068152619e-05, "loss": 0.2984, "step": 267},
    {"epoch": 0.18053216571236105, "grad_norm": 0.5685368180274963, "learning_rate": 1.4707838890638941e-05, "loss": 0.2787, "step": 268},
    {"epoch": 0.1812057931963624, "grad_norm": 0.6349403262138367, "learning_rate": 1.4702940797578345e-05, "loss": 0.3078, "step": 269},
    {"epoch": 0.18187942068036375, "grad_norm": 0.6529637575149536, "learning_rate": 1.4698002816095473e-05, "loss": 0.307, "step": 270},
    {"epoch": 0.1825530481643651, "grad_norm": 0.5679558515548706, "learning_rate": 1.4693024973535863e-05, "loss": 0.25, "step": 271},
    {"epoch": 0.18322667564836645, "grad_norm": 0.5999310612678528, "learning_rate": 1.4688007297465796e-05, "loss": 0.259, "step": 272},
    {"epoch": 0.1839003031323678, "grad_norm": 0.6034629344940186, "learning_rate": 1.4682949815672146e-05, "loss": 0.3071, "step": 273},
    {"epoch": 0.18457393061636915, "grad_norm": 0.610670268535614, "learning_rate": 1.467785255616221e-05, "loss": 0.2913, "step": 274},
    {"epoch": 0.1852475581003705, "grad_norm": 0.628016471862793, "learning_rate": 1.4672715547163584e-05, "loss": 0.2839, "step": 275},
    {"epoch": 0.18592118558437185, "grad_norm": 0.6297721862792969, "learning_rate": 1.4667538817123977e-05, "loss": 0.3403, "step": 276},
    {"epoch": 0.1865948130683732, "grad_norm": 0.540552020072937, "learning_rate": 1.4662322394711067e-05, "loss": 0.2454, "step": 277},
    {"epoch": 0.18726844055237454, "grad_norm": 0.513788640499115, "learning_rate": 1.4657066308812342e-05, "loss": 0.233, "step": 278},
    {"epoch": 0.1879420680363759, "grad_norm": 0.6221415996551514, "learning_rate": 1.4651770588534937e-05, "loss": 0.2969, "step": 279},
    {"epoch": 0.18861569552037724, "grad_norm": 0.5859697461128235, "learning_rate": 1.4646435263205475e-05, "loss": 0.2771, "step": 280},
    {"epoch": 0.1892893230043786, "grad_norm": 0.5720670819282532, "learning_rate": 1.4641060362369904e-05, "loss": 0.2758, "step": 281},
    {"epoch": 0.18996295048837994, "grad_norm": 0.5609393119812012, "learning_rate": 1.4635645915793333e-05, "loss": 0.256, "step": 282},
    {"epoch": 0.19063657797238126, "grad_norm": 0.5734854340553284, "learning_rate": 1.4630191953459862e-05, "loss": 0.3233, "step": 283},
    {"epoch": 0.1913102054563826, "grad_norm": 0.570590615272522, "learning_rate": 1.4624698505572432e-05, "loss": 0.2757, "step": 284},
    {"epoch": 0.19198383294038396, "grad_norm": 0.623126208782196, "learning_rate": 1.4619165602552637e-05, "loss": 0.2964, "step": 285},
    {"epoch": 0.1926574604243853, "grad_norm": 0.5599439144134521, "learning_rate": 1.4613593275040572e-05, "loss": 0.2582, "step": 286},
    {"epoch": 0.19333108790838666, "grad_norm": 0.5614957809448242, "learning_rate": 1.4607981553894654e-05, "loss": 0.27, "step": 287},
    {"epoch": 0.194004715392388, "grad_norm": 0.5625648498535156, "learning_rate": 1.4602330470191453e-05, "loss": 0.2751, "step": 288},
    {"epoch": 0.19467834287638935, "grad_norm": 0.5504026412963867, "learning_rate": 1.4596640055225521e-05, "loss": 0.2429, "step": 289},
    {"epoch": 0.1953519703603907, "grad_norm": 0.5794048309326172, "learning_rate": 1.4590910340509224e-05, "loss": 0.2882, "step": 290},
    {"epoch": 0.19602559784439205, "grad_norm": 0.550942599773407, "learning_rate": 1.4585141357772554e-05, "loss": 0.2604, "step": 291},
    {"epoch": 0.1966992253283934, "grad_norm": 0.6088408827781677, "learning_rate": 1.4579333138962966e-05, "loss": 0.2993, "step": 292},
    {"epoch": 0.19737285281239475, "grad_norm": 0.6309805512428284, "learning_rate": 1.4573485716245193e-05, "loss": 0.297, "step": 293},
    {"epoch": 0.1980464802963961, "grad_norm": 0.6433154344558716, "learning_rate": 1.456759912200108e-05, "loss": 0.2919, "step": 294},
    {"epoch": 0.19872010778039745, "grad_norm": 0.6373067498207092, "learning_rate": 1.456167338882938e-05, "loss": 0.2719, "step": 295},
    {"epoch": 0.1993937352643988, "grad_norm": 0.5514649748802185, "learning_rate": 1.4555708549545607e-05, "loss": 0.2638, "step": 296},
    {"epoch": 0.20006736274840015, "grad_norm": 0.5804110169410706, "learning_rate": 1.4549704637181827e-05, "loss": 0.2828, "step": 297},
    {"epoch": 0.2007409902324015, "grad_norm": 0.5397315621376038, "learning_rate": 1.4543661684986484e-05, "loss": 0.2712, "step": 298},
    {"epoch": 0.20141461771640282, "grad_norm": 0.6435424089431763, "learning_rate": 1.4537579726424221e-05, "loss": 0.3095, "step": 299},
    {"epoch": 0.20208824520040417, "grad_norm": 0.5241397023200989, "learning_rate": 1.453145879517569e-05, "loss": 0.2635, "step": 300},
    {"epoch": 0.20208824520040417, "eval_loss": 0.2736159861087799, "eval_runtime": 107.1602, "eval_samples_per_second": 46.659, "eval_steps_per_second": 2.921, "step": 300},
    {"epoch": 0.20276187268440551, "grad_norm": 0.5774008631706238, "learning_rate": 1.4525298925137362e-05, "loss": 0.2752, "step": 301},
    {"epoch": 0.20343550016840686, "grad_norm": 0.5994575619697571, "learning_rate": 1.4519100150421343e-05, "loss": 0.3073, "step": 302},
    {"epoch": 0.2041091276524082, "grad_norm": 0.5691470503807068, "learning_rate": 1.4512862505355195e-05, "loss": 0.2846, "step": 303},
    {"epoch": 0.20478275513640956, "grad_norm": 0.5722606182098389, "learning_rate": 1.450658602448172e-05, "loss": 0.2549, "step": 304},
    {"epoch": 0.2054563826204109, "grad_norm": 0.632279634475708, "learning_rate": 1.45002707425588e-05, "loss": 0.3197, "step": 305},
    {"epoch": 0.20613001010441226, "grad_norm": 0.5538962483406067, "learning_rate": 1.449391669455918e-05, "loss": 0.2656, "step": 306},
    {"epoch": 0.2068036375884136, "grad_norm": 0.5925297737121582, "learning_rate": 1.4487523915670286e-05, "loss": 0.2821, "step": 307},
    {"epoch": 0.20747726507241496, "grad_norm": 0.6299713850021362, "learning_rate": 1.448109244129403e-05, "loss": 0.3116, "step": 308},
    {"epoch": 0.2081508925564163, "grad_norm": 0.6114513874053955, "learning_rate": 1.447462230704661e-05, "loss": 0.285, "step": 309},
    {"epoch": 0.20882452004041765, "grad_norm": 0.5723987817764282, "learning_rate": 1.4468113548758313e-05, "loss": 0.278, "step": 310},
    {"epoch": 0.209498147524419, "grad_norm": 0.5769573450088501, "learning_rate": 1.4461566202473322e-05, "loss": 0.2892, "step": 311},
    {"epoch": 0.21017177500842035, "grad_norm": 0.6040593981742859, "learning_rate": 1.4454980304449506e-05, "loss": 0.3123, "step": 312},
    {"epoch": 0.2108454024924217, "grad_norm": 0.5362566113471985, "learning_rate": 1.4448355891158235e-05, "loss": 0.24, "step": 313},
    {"epoch": 0.21151902997642305, "grad_norm": 0.560070812702179, "learning_rate": 1.4441692999284159e-05, "loss": 0.2663, "step": 314},
    {"epoch": 0.21219265746042437, "grad_norm": 0.6649965047836304, "learning_rate": 1.443499166572502e-05, "loss": 0.3441, "step": 315},
    {"epoch": 0.21286628494442572, "grad_norm": 0.5337359309196472, "learning_rate": 1.4428251927591445e-05, "loss": 0.253, "step": 316},
    {"epoch": 0.21353991242842707, "grad_norm": 0.6185274720191956, "learning_rate": 1.4421473822206729e-05, "loss": 0.305, "step": 317},
    {"epoch": 0.21421353991242842, "grad_norm": 0.5125210881233215, "learning_rate": 1.4414657387106646e-05, "loss": 0.2774, "step": 318},
    {"epoch": 0.21488716739642977, "grad_norm": 0.5758813619613647, "learning_rate": 1.4407802660039226e-05, "loss": 0.2484, "step": 319},
    {"epoch": 0.21556079488043112, "grad_norm": 0.5220269560813904, "learning_rate": 1.4400909678964556e-05, "loss": 0.2399, "step": 320},
    {"epoch": 0.21623442236443247, "grad_norm": 0.5919392704963684, "learning_rate": 1.4393978482054561e-05, "loss": 0.2924, "step": 321},
    {"epoch": 0.21690804984843381, "grad_norm": 0.5359899997711182, "learning_rate": 1.4387009107692808e-05, "loss": 0.2493, "step": 322},
    {"epoch": 0.21758167733243516, "grad_norm": 0.568356454372406, "learning_rate": 1.4380001594474267e-05, "loss": 0.2877, "step": 323},
    {"epoch": 0.2182553048164365, "grad_norm": 0.5183501243591309, "learning_rate": 1.4372955981205127e-05, "loss": 0.262, "step": 324},
    {"epoch": 0.21892893230043786, "grad_norm": 0.5353648662567139, "learning_rate": 1.436587230690256e-05, "loss": 0.269, "step": 325},
    {"epoch": 0.2196025597844392, "grad_norm": 0.5863710641860962, "learning_rate": 1.4358750610794522e-05, "loss": 0.2933, "step": 326},
    {"epoch": 0.22027618726844056, "grad_norm": 0.5193360447883606, "learning_rate": 1.4351590932319506e-05, "loss": 0.2539, "step": 327},
    {"epoch": 0.2209498147524419, "grad_norm": 0.521597146987915, "learning_rate": 1.4344393311126367e-05, "loss": 0.24, "step": 328},
    {"epoch": 0.22162344223644326, "grad_norm": 0.5621289014816284, "learning_rate": 1.4337157787074063e-05, "loss": 0.2647, "step": 329},
    {"epoch": 0.2222970697204446, "grad_norm": 0.6134183406829834, "learning_rate": 1.432988440023146e-05, "loss": 0.2846, "step": 330},
    {"epoch": 0.22297069720444593, "grad_norm": 0.5819990634918213, "learning_rate": 1.4322573190877091e-05, "loss": 0.2725, "step": 331},
    {"epoch": 0.22364432468844728, "grad_norm": 0.6009438037872314, "learning_rate": 1.4315224199498952e-05, "loss": 0.2507, "step": 332},
    {"epoch": 0.22431795217244863, "grad_norm": 0.5484105944633484, "learning_rate": 1.4307837466794258e-05, "loss": 0.2715, "step": 333},
    {"epoch": 0.22499157965644997, "grad_norm": 0.5025244951248169, "learning_rate": 1.4300413033669241e-05, "loss": 0.2257, "step": 334},
    {"epoch": 0.22566520714045132, "grad_norm": 0.5583484172821045, "learning_rate": 1.4292950941238898e-05, "loss": 0.3015, "step": 335},
    {"epoch": 0.22633883462445267, "grad_norm": 0.5975006222724915, "learning_rate": 1.4285451230826783e-05, "loss": 0.2924, "step": 336},
    {"epoch": 0.22701246210845402, "grad_norm": 0.6017155051231384, "learning_rate": 1.4277913943964763e-05, "loss": 0.2928, "step": 337},
    {"epoch": 0.22768608959245537, "grad_norm": 0.5619384050369263, "learning_rate": 1.4270339122392808e-05, "loss": 0.2744, "step": 338},
    {"epoch": 0.22835971707645672, "grad_norm": 0.576554536819458, "learning_rate": 1.4262726808058735e-05, "loss": 0.3019, "step": 339},
    {"epoch": 0.22903334456045807, "grad_norm": 0.5621641874313354, "learning_rate": 1.4255077043117994e-05, "loss": 0.2801, "step": 340},
    {"epoch": 0.22970697204445942, "grad_norm": 0.5104705095291138, "learning_rate": 1.424738986993343e-05, "loss": 0.2572, "step": 341},
    {"epoch": 0.23038059952846077, "grad_norm": 0.5731213688850403, "learning_rate": 1.4239665331075048e-05, "loss": 0.2545, "step": 342},
    {"epoch": 0.23105422701246212, "grad_norm": 0.6381127238273621, "learning_rate": 1.4231903469319772e-05, "loss": 0.3023, "step": 343},
    {"epoch": 0.23172785449646346, "grad_norm": 0.5358138680458069, "learning_rate": 1.4224104327651213e-05, "loss": 0.2597, "step": 344},
    {"epoch": 0.2324014819804648, "grad_norm": 0.5517827272415161, "learning_rate": 1.4216267949259437e-05, "loss": 0.2669, "step": 345},
    {"epoch": 0.23307510946446616, "grad_norm": 0.5380638241767883, "learning_rate": 1.4208394377540712e-05, "loss": 0.2706, "step": 346},
    {"epoch": 0.23374873694846748, "grad_norm": 0.6162987351417542, "learning_rate": 1.4200483656097278e-05, "loss": 0.2721, "step": 347},
    {"epoch": 0.23442236443246883, "grad_norm": 0.6142714619636536, "learning_rate": 1.4192535828737102e-05, "loss": 0.3158, "step": 348},
    {"epoch": 0.23509599191647018, "grad_norm": 0.6231828331947327, "learning_rate": 1.4184550939473644e-05, "loss": 0.3022, "step": 349},
    {"epoch": 0.23576961940047153, "grad_norm": 0.5371239185333252, "learning_rate": 1.4176529032525584e-05, "loss": 0.2372, "step": 350},
    {"epoch": 0.23644324688447288, "grad_norm": 0.5987442135810852, "learning_rate": 1.4168470152316624e-05, "loss": 0.2856, "step": 351},
    {"epoch": 0.23711687436847423, "grad_norm": 0.5490831732749939, "learning_rate": 1.41603743434752e-05, "loss": 0.2352, "step": 352},
    {"epoch": 0.23779050185247558, "grad_norm": 0.5611885786056519, "learning_rate": 1.415224165083426e-05, "loss": 0.2763, "step": 353},
    {"epoch": 0.23846412933647693, "grad_norm": 0.5451275706291199, "learning_rate": 1.4144072119431e-05, "loss": 0.2725, "step": 354},
    {"epoch": 0.23913775682047828, "grad_norm": 0.5789247155189514, "learning_rate": 1.413586579450662e-05, "loss": 0.2604, "step": 355},
    {"epoch": 0.23981138430447962, "grad_norm": 0.6164606213569641, "learning_rate": 1.4127622721506087e-05, "loss": 0.2932, "step": 356},
    {"epoch": 0.24048501178848097, "grad_norm": 0.5564325451850891, "learning_rate": 1.4119342946077864e-05, "loss": 0.2735, "step": 357},
    {"epoch": 0.24115863927248232, "grad_norm": 0.6473014950752258, "learning_rate": 1.4111026514073657e-05, "loss": 0.2808, "step": 358},
    {"epoch": 0.24183226675648367, "grad_norm": 0.5950415730476379, "learning_rate": 1.4102673471548186e-05, "loss": 0.2819, "step": 359},
    {"epoch": 0.24250589424048502, "grad_norm": 0.576295793056488, "learning_rate": 1.4094283864758896e-05, "loss": 0.2818, "step": 360},
    {"epoch": 0.24317952172448637, "grad_norm": 0.5290201306343079, "learning_rate": 1.4085857740165727e-05, "loss": 0.2731, "step": 361},
    {"epoch": 0.24385314920848772, "grad_norm": 0.5469079613685608, "learning_rate": 1.4077395144430845e-05, "loss": 0.2533, "step": 362},
    {"epoch": 0.24452677669248907, "grad_norm": 0.553629457950592, "learning_rate": 1.4068896124418383e-05, "loss": 0.2784, "step": 363},
    {"epoch": 0.2452004041764904, "grad_norm": 0.5426369905471802,
|
"learning_rate": 1.4060360727194188e-05, |
|
"loss": 0.2687, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.24587403166049174, |
|
"grad_norm": 0.5466113686561584, |
|
"learning_rate": 1.4051789000025555e-05, |
|
"loss": 0.2721, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.2465476591444931, |
|
"grad_norm": 0.5685258507728577, |
|
"learning_rate": 1.4043180990380968e-05, |
|
"loss": 0.283, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.24722128662849444, |
|
"grad_norm": 0.5648797154426575, |
|
"learning_rate": 1.4034536745929835e-05, |
|
"loss": 0.2579, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.24789491411249578, |
|
"grad_norm": 0.5363840460777283, |
|
"learning_rate": 1.4025856314542223e-05, |
|
"loss": 0.2577, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.24856854159649713, |
|
"grad_norm": 0.5171375870704651, |
|
"learning_rate": 1.40171397442886e-05, |
|
"loss": 0.2351, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.24924216908049848, |
|
"grad_norm": 0.646500825881958, |
|
"learning_rate": 1.4008387083439554e-05, |
|
"loss": 0.3039, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.24991579656449983, |
|
"grad_norm": 0.5827479362487793, |
|
"learning_rate": 1.3999598380465552e-05, |
|
"loss": 0.2913, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.25058942404850115, |
|
"grad_norm": 0.5602329969406128, |
|
"learning_rate": 1.3990773684036636e-05, |
|
"loss": 0.2822, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.2512630515325025, |
|
"grad_norm": 0.5731973648071289, |
|
"learning_rate": 1.3981913043022187e-05, |
|
"loss": 0.2638, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.25193667901650385, |
|
"grad_norm": 0.6127945780754089, |
|
"learning_rate": 1.397301650649063e-05, |
|
"loss": 0.314, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.2526103065005052, |
|
"grad_norm": 0.5554071664810181, |
|
"learning_rate": 1.396408412370918e-05, |
|
"loss": 0.2575, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.25328393398450655, |
|
"grad_norm": 0.5913053750991821, |
|
"learning_rate": 1.3955115944143558e-05, |
|
"loss": 0.2669, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.2539575614685079, |
|
"grad_norm": 0.6104479432106018, |
|
"learning_rate": 1.3946112017457715e-05, |
|
"loss": 0.2575, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.25463118895250925, |
|
"grad_norm": 0.6109972596168518, |
|
"learning_rate": 1.393707239351357e-05, |
|
"loss": 0.3141, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.2553048164365106, |
|
"grad_norm": 0.605560302734375, |
|
"learning_rate": 1.3927997122370724e-05, |
|
"loss": 0.2869, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.25597844392051194, |
|
"grad_norm": 0.5215985774993896, |
|
"learning_rate": 1.3918886254286182e-05, |
|
"loss": 0.2464, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.2566520714045133, |
|
"grad_norm": 0.5480206608772278, |
|
"learning_rate": 1.3909739839714081e-05, |
|
"loss": 0.2713, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.25732569888851464, |
|
"grad_norm": 0.5150758028030396, |
|
"learning_rate": 1.3900557929305408e-05, |
|
"loss": 0.2537, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.257999326372516, |
|
"grad_norm": 0.606860876083374, |
|
"learning_rate": 1.3891340573907715e-05, |
|
"loss": 0.2929, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.25867295385651734, |
|
"grad_norm": 0.5383312106132507, |
|
"learning_rate": 1.3882087824564841e-05, |
|
"loss": 0.2778, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.2593465813405187, |
|
"grad_norm": 0.5356404185295105, |
|
"learning_rate": 1.3872799732516635e-05, |
|
"loss": 0.2318, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.26002020882452004, |
|
"grad_norm": 0.5665723085403442, |
|
"learning_rate": 1.386347634919866e-05, |
|
"loss": 0.2898, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.2606938363085214, |
|
"grad_norm": 0.5390300750732422, |
|
"learning_rate": 1.3854117726241922e-05, |
|
"loss": 0.2789, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.26136746379252274, |
|
"grad_norm": 0.5479271411895752, |
|
"learning_rate": 1.3844723915472568e-05, |
|
"loss": 0.2552, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.2620410912765241, |
|
"grad_norm": 0.6038428544998169, |
|
"learning_rate": 1.3835294968911615e-05, |
|
"loss": 0.3018, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.26271471876052543, |
|
"grad_norm": 0.5380761027336121, |
|
"learning_rate": 1.3825830938774653e-05, |
|
"loss": 0.2683, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.2633883462445268, |
|
"grad_norm": 0.5072317719459534, |
|
"learning_rate": 1.3816331877471562e-05, |
|
"loss": 0.2728, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.26406197372852813, |
|
"grad_norm": 0.5953329205513, |
|
"learning_rate": 1.3806797837606206e-05, |
|
"loss": 0.2644, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.2647356012125295, |
|
"grad_norm": 0.5941304564476013, |
|
"learning_rate": 1.3797228871976162e-05, |
|
"loss": 0.2841, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.26540922869653083, |
|
"grad_norm": 0.6646502614021301, |
|
"learning_rate": 1.378762503357242e-05, |
|
"loss": 0.2966, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.2660828561805322, |
|
"grad_norm": 0.545456051826477, |
|
"learning_rate": 1.377798637557908e-05, |
|
"loss": 0.2481, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.26675648366453353, |
|
"grad_norm": 0.5886520147323608, |
|
"learning_rate": 1.3768312951373076e-05, |
|
"loss": 0.2735, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.2674301111485349, |
|
"grad_norm": 0.5731514096260071, |
|
"learning_rate": 1.3758604814523863e-05, |
|
"loss": 0.2953, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.2681037386325362, |
|
"grad_norm": 0.5029922723770142, |
|
"learning_rate": 1.3748862018793131e-05, |
|
"loss": 0.228, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.2687773661165376, |
|
"grad_norm": 0.557115375995636, |
|
"learning_rate": 1.3739084618134502e-05, |
|
"loss": 0.2861, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.2694509936005389, |
|
"grad_norm": 0.5246098041534424, |
|
"learning_rate": 1.3729272666693235e-05, |
|
"loss": 0.2705, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2694509936005389, |
|
"eval_loss": 0.2706840932369232, |
|
"eval_runtime": 105.373, |
|
"eval_samples_per_second": 47.451, |
|
"eval_steps_per_second": 2.97, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2701246210845403, |
|
"grad_norm": 0.5355361104011536, |
|
"learning_rate": 1.371942621880592e-05, |
|
"loss": 0.249, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.2707982485685416, |
|
"grad_norm": 0.5726237297058105, |
|
"learning_rate": 1.3709545329000187e-05, |
|
"loss": 0.2849, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.27147187605254297, |
|
"grad_norm": 0.5560792088508606, |
|
"learning_rate": 1.3699630051994395e-05, |
|
"loss": 0.2397, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.27214550353654426, |
|
"grad_norm": 0.509462833404541, |
|
"learning_rate": 1.3689680442697332e-05, |
|
"loss": 0.2412, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.2728191310205456, |
|
"grad_norm": 0.5348261594772339, |
|
"learning_rate": 1.3679696556207913e-05, |
|
"loss": 0.2588, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.27349275850454696, |
|
"grad_norm": 0.5228528380393982, |
|
"learning_rate": 1.3669678447814871e-05, |
|
"loss": 0.2482, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.2741663859885483, |
|
"grad_norm": 0.5533547401428223, |
|
"learning_rate": 1.3659626172996459e-05, |
|
"loss": 0.2581, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.27484001347254966, |
|
"grad_norm": 0.538163959980011, |
|
"learning_rate": 1.3649539787420126e-05, |
|
"loss": 0.2444, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.275513640956551, |
|
"grad_norm": 0.6091170907020569, |
|
"learning_rate": 1.3639419346942227e-05, |
|
"loss": 0.2963, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.27618726844055236, |
|
"grad_norm": 0.5507506728172302, |
|
"learning_rate": 1.3629264907607709e-05, |
|
"loss": 0.2835, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.2768608959245537, |
|
"grad_norm": 0.5167334079742432, |
|
"learning_rate": 1.361907652564979e-05, |
|
"loss": 0.2751, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.27753452340855506, |
|
"grad_norm": 0.6182762384414673, |
|
"learning_rate": 1.3608854257489656e-05, |
|
"loss": 0.2953, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.2782081508925564, |
|
"grad_norm": 0.6356998085975647, |
|
"learning_rate": 1.3598598159736155e-05, |
|
"loss": 0.2586, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.27888177837655775, |
|
"grad_norm": 0.5957326889038086, |
|
"learning_rate": 1.358830828918547e-05, |
|
"loss": 0.283, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.2795554058605591, |
|
"grad_norm": 0.5173368453979492, |
|
"learning_rate": 1.3577984702820811e-05, |
|
"loss": 0.2403, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.28022903334456045, |
|
"grad_norm": 0.5449368357658386, |
|
"learning_rate": 1.3567627457812107e-05, |
|
"loss": 0.2641, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.2809026608285618, |
|
"grad_norm": 0.6340479850769043, |
|
"learning_rate": 1.355723661151567e-05, |
|
"loss": 0.3286, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.28157628831256315, |
|
"grad_norm": 0.49671491980552673, |
|
"learning_rate": 1.3546812221473898e-05, |
|
"loss": 0.2585, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.2822499157965645, |
|
"grad_norm": 0.5974727272987366, |
|
"learning_rate": 1.3536354345414944e-05, |
|
"loss": 0.2674, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.28292354328056585, |
|
"grad_norm": 0.5984825491905212, |
|
"learning_rate": 1.35258630412524e-05, |
|
"loss": 0.2548, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.2835971707645672, |
|
"grad_norm": 0.5152942538261414, |
|
"learning_rate": 1.3515338367084975e-05, |
|
"loss": 0.2323, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.28427079824856855, |
|
"grad_norm": 0.5210486054420471, |
|
"learning_rate": 1.3504780381196178e-05, |
|
"loss": 0.2538, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.2849444257325699, |
|
"grad_norm": 0.6852086782455444, |
|
"learning_rate": 1.3494189142053988e-05, |
|
"loss": 0.3409, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.28561805321657124, |
|
"grad_norm": 0.5637288689613342, |
|
"learning_rate": 1.3483564708310535e-05, |
|
"loss": 0.2435, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.2862916807005726, |
|
"grad_norm": 0.565467357635498, |
|
"learning_rate": 1.3472907138801775e-05, |
|
"loss": 0.2699, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.28696530818457394, |
|
"grad_norm": 0.6443371176719666, |
|
"learning_rate": 1.346221649254716e-05, |
|
"loss": 0.3226, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.2876389356685753, |
|
"grad_norm": 0.5877301096916199, |
|
"learning_rate": 1.3451492828749317e-05, |
|
"loss": 0.2626, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.28831256315257664, |
|
"grad_norm": 0.635368824005127, |
|
"learning_rate": 1.3440736206793717e-05, |
|
"loss": 0.2808, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.288986190636578, |
|
"grad_norm": 0.5623096823692322, |
|
"learning_rate": 1.3429946686248346e-05, |
|
"loss": 0.2583, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.28965981812057934, |
|
"grad_norm": 0.5355499386787415, |
|
"learning_rate": 1.341912432686338e-05, |
|
"loss": 0.2425, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.2903334456045807, |
|
"grad_norm": 0.5870991349220276, |
|
"learning_rate": 1.3408269188570837e-05, |
|
"loss": 0.2638, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.29100707308858204, |
|
"grad_norm": 0.5296127796173096, |
|
"learning_rate": 1.3397381331484273e-05, |
|
"loss": 0.2587, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.2916807005725834, |
|
"grad_norm": 0.5635933876037598, |
|
"learning_rate": 1.3386460815898427e-05, |
|
"loss": 0.2966, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.29235432805658473, |
|
"grad_norm": 0.5246622562408447, |
|
"learning_rate": 1.3375507702288894e-05, |
|
"loss": 0.2513, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.2930279555405861, |
|
"grad_norm": 0.6050205826759338, |
|
"learning_rate": 1.3364522051311793e-05, |
|
"loss": 0.3016, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.2937015830245874, |
|
"grad_norm": 0.5831138491630554, |
|
"learning_rate": 1.3353503923803424e-05, |
|
"loss": 0.312, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.2943752105085887, |
|
"grad_norm": 0.5354754328727722, |
|
"learning_rate": 1.3342453380779939e-05, |
|
"loss": 0.2743, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.2950488379925901, |
|
"grad_norm": 0.6059128642082214, |
|
"learning_rate": 1.3331370483437e-05, |
|
"loss": 0.2836, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.2957224654765914, |
|
"grad_norm": 0.6208754181861877, |
|
"learning_rate": 1.332025529314944e-05, |
|
"loss": 0.3069, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.29639609296059277, |
|
"grad_norm": 0.5791683197021484, |
|
"learning_rate": 1.3309107871470922e-05, |
|
"loss": 0.2904, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.2970697204445941, |
|
"grad_norm": 0.5765690803527832, |
|
"learning_rate": 1.3297928280133606e-05, |
|
"loss": 0.3015, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.29774334792859547, |
|
"grad_norm": 0.5978572368621826, |
|
"learning_rate": 1.3286716581047791e-05, |
|
"loss": 0.2827, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.2984169754125968, |
|
"grad_norm": 0.5690959692001343, |
|
"learning_rate": 1.3275472836301592e-05, |
|
"loss": 0.2819, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.29909060289659817, |
|
"grad_norm": 0.5888264775276184, |
|
"learning_rate": 1.3264197108160582e-05, |
|
"loss": 0.297, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.2997642303805995, |
|
"grad_norm": 0.566338837146759, |
|
"learning_rate": 1.3252889459067452e-05, |
|
"loss": 0.2703, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.30043785786460087, |
|
"grad_norm": 0.5249893665313721, |
|
"learning_rate": 1.3241549951641663e-05, |
|
"loss": 0.252, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.3011114853486022, |
|
"grad_norm": 0.6007825136184692, |
|
"learning_rate": 1.3230178648679102e-05, |
|
"loss": 0.2696, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.30178511283260356, |
|
"grad_norm": 0.5482873916625977, |
|
"learning_rate": 1.3218775613151737e-05, |
|
"loss": 0.2523, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.3024587403166049, |
|
"grad_norm": 0.6056886315345764, |
|
"learning_rate": 1.3207340908207258e-05, |
|
"loss": 0.2616, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.30313236780060626, |
|
"grad_norm": 0.5885447859764099, |
|
"learning_rate": 1.319587459716874e-05, |
|
"loss": 0.2976, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.3038059952846076, |
|
"grad_norm": 0.5747894644737244, |
|
"learning_rate": 1.318437674353428e-05, |
|
"loss": 0.2898, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.30447962276860896, |
|
"grad_norm": 0.569401741027832, |
|
"learning_rate": 1.3172847410976658e-05, |
|
"loss": 0.3104, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.3051532502526103, |
|
"grad_norm": 0.5612210631370544, |
|
"learning_rate": 1.3161286663342972e-05, |
|
"loss": 0.2825, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.30582687773661166, |
|
"grad_norm": 0.5914261937141418, |
|
"learning_rate": 1.3149694564654295e-05, |
|
"loss": 0.2781, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.306500505220613, |
|
"grad_norm": 0.5259233713150024, |
|
"learning_rate": 1.3138071179105314e-05, |
|
"loss": 0.2542, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.30717413270461436, |
|
"grad_norm": 0.5168178081512451, |
|
"learning_rate": 1.3126416571063972e-05, |
|
"loss": 0.2514, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.3078477601886157, |
|
"grad_norm": 0.5078200101852417, |
|
"learning_rate": 1.3114730805071123e-05, |
|
"loss": 0.2422, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.30852138767261705, |
|
"grad_norm": 0.5727274417877197, |
|
"learning_rate": 1.3103013945840166e-05, |
|
"loss": 0.2809, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.3091950151566184, |
|
"grad_norm": 0.5502845048904419, |
|
"learning_rate": 1.309126605825668e-05, |
|
"loss": 0.2552, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.30986864264061975, |
|
"grad_norm": 0.5696067214012146, |
|
"learning_rate": 1.3079487207378084e-05, |
|
"loss": 0.2959, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.3105422701246211, |
|
"grad_norm": 0.5644879341125488, |
|
"learning_rate": 1.3067677458433258e-05, |
|
"loss": 0.2713, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.31121589760862245, |
|
"grad_norm": 0.5638664364814758, |
|
"learning_rate": 1.3055836876822196e-05, |
|
"loss": 0.2687, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.3118895250926238, |
|
"grad_norm": 0.5337838530540466, |
|
"learning_rate": 1.3043965528115625e-05, |
|
"loss": 0.2238, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.31256315257662515, |
|
"grad_norm": 0.5844706892967224, |
|
"learning_rate": 1.3032063478054666e-05, |
|
"loss": 0.268, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.3132367800606265, |
|
"grad_norm": 0.6730402112007141, |
|
"learning_rate": 1.3020130792550456e-05, |
|
"loss": 0.2976, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.31391040754462785, |
|
"grad_norm": 0.5756520628929138, |
|
"learning_rate": 1.3008167537683776e-05, |
|
"loss": 0.2859, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.3145840350286292, |
|
"grad_norm": 0.5855886340141296, |
|
"learning_rate": 1.2996173779704704e-05, |
|
"loss": 0.2997, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.3152576625126305, |
|
"grad_norm": 0.5359857082366943, |
|
"learning_rate": 1.2984149585032237e-05, |
|
"loss": 0.2814, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.31593128999663184, |
|
"grad_norm": 0.5448024868965149, |
|
"learning_rate": 1.2972095020253912e-05, |
|
"loss": 0.2681, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.3166049174806332, |
|
"grad_norm": 0.518844723701477, |
|
"learning_rate": 1.296001015212547e-05, |
|
"loss": 0.2538, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.31727854496463453, |
|
"grad_norm": 0.5422329306602478, |
|
"learning_rate": 1.2947895047570446e-05, |
|
"loss": 0.2346, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.3179521724486359, |
|
"grad_norm": 0.5567420721054077, |
|
"learning_rate": 1.2935749773679833e-05, |
|
"loss": 0.259, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.31862579993263723, |
|
"grad_norm": 0.5199055671691895, |
|
"learning_rate": 1.2923574397711684e-05, |
|
"loss": 0.2273, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.3192994274166386, |
|
"grad_norm": 0.552947461605072, |
|
"learning_rate": 1.291136898709076e-05, |
|
"loss": 0.2541, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.31997305490063993, |
|
"grad_norm": 0.537124752998352, |
|
"learning_rate": 1.2899133609408146e-05, |
|
"loss": 0.2709, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.3206466823846413, |
|
"grad_norm": 0.5493146777153015, |
|
"learning_rate": 1.2886868332420873e-05, |
|
"loss": 0.2838, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.32132030986864263, |
|
"grad_norm": 0.6109126806259155, |
|
"learning_rate": 1.2874573224051556e-05, |
|
"loss": 0.3088, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.321993937352644, |
|
"grad_norm": 0.5879717469215393, |
|
"learning_rate": 1.2862248352388005e-05, |
|
"loss": 0.282, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.3226675648366453, |
|
"grad_norm": 0.5227838754653931, |
|
"learning_rate": 1.2849893785682852e-05, |
|
"loss": 0.2646, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.3233411923206467, |
|
"grad_norm": 0.4744933545589447, |
|
"learning_rate": 1.2837509592353181e-05, |
|
"loss": 0.2219, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.324014819804648, |
|
"grad_norm": 0.508222758769989, |
|
"learning_rate": 1.2825095840980133e-05, |
|
"loss": 0.2698, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.3246884472886494, |
|
"grad_norm": 0.5351443290710449, |
|
"learning_rate": 1.2812652600308544e-05, |
|
"loss": 0.2617, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.3253620747726507, |
|
"grad_norm": 0.5842475295066833, |
|
"learning_rate": 1.2800179939246552e-05, |
|
"loss": 0.2496, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.32603570225665207, |
|
"grad_norm": 0.5165258646011353, |
|
"learning_rate": 1.2787677926865216e-05, |
|
"loss": 0.2399, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.3267093297406534, |
|
"grad_norm": 0.5721768736839294, |
|
"learning_rate": 1.2775146632398142e-05, |
|
"loss": 0.2754, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.32738295722465477, |
|
"grad_norm": 0.47171083092689514, |
|
"learning_rate": 1.2762586125241093e-05, |
|
"loss": 0.2107, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.3280565847086561, |
|
"grad_norm": 0.5318099856376648, |
|
"learning_rate": 1.2749996474951603e-05, |
|
"loss": 0.2422, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.32873021219265747, |
|
"grad_norm": 0.5478540062904358, |
|
"learning_rate": 1.2737377751248598e-05, |
|
"loss": 0.2634, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.3294038396766588, |
|
"grad_norm": 0.4972551167011261, |
|
"learning_rate": 1.2724730024012002e-05, |
|
"loss": 0.232, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.33007746716066017, |
|
"grad_norm": 0.6141415238380432, |
|
"learning_rate": 1.2712053363282363e-05, |
|
"loss": 0.2998, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.3307510946446615, |
|
"grad_norm": 0.5177733302116394, |
|
"learning_rate": 1.2699347839260448e-05, |
|
"loss": 0.2574, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.33142472212866286, |
|
"grad_norm": 0.5531916618347168, |
|
"learning_rate": 1.268661352230687e-05, |
|
"loss": 0.2719, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.3320983496126642, |
|
"grad_norm": 0.5089963674545288, |
|
"learning_rate": 1.2673850482941687e-05, |
|
"loss": 0.2508, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.33277197709666556, |
|
"grad_norm": 0.557072103023529, |
|
"learning_rate": 1.2661058791844016e-05, |
|
"loss": 0.2823, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.3334456045806669, |
|
"grad_norm": 0.6557756662368774, |
|
"learning_rate": 1.2648238519851644e-05, |
|
"loss": 0.2821, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.33411923206466826, |
|
"grad_norm": 0.5633836984634399, |
|
"learning_rate": 1.2635389737960632e-05, |
|
"loss": 0.2576, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.3347928595486696, |
|
"grad_norm": 0.594456136226654, |
|
"learning_rate": 1.262251251732492e-05, |
|
"loss": 0.2985, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.33546648703267096, |
|
"grad_norm": 0.5753186345100403, |
|
"learning_rate": 1.2609606929255942e-05, |
|
"loss": 0.2775, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.3361401145166723, |
|
"grad_norm": 0.6262162327766418, |
|
"learning_rate": 1.259667304522222e-05, |
|
"loss": 0.3254, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.33681374200067365, |
|
"grad_norm": 0.5529574155807495, |
|
"learning_rate": 1.2583710936848977e-05, |
|
"loss": 0.2711, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.33681374200067365, |
|
"eval_loss": 0.2681807279586792, |
|
"eval_runtime": 104.7062, |
|
"eval_samples_per_second": 47.753, |
|
"eval_steps_per_second": 2.989, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.33748736948467495, |
|
"grad_norm": 0.6187270283699036, |
|
"learning_rate": 1.2570720675917734e-05, |
|
"loss": 0.3082, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.3381609969686763, |
|
"grad_norm": 0.5153407454490662, |
|
"learning_rate": 1.2557702334365916e-05, |
|
"loss": 0.26, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.33883462445267765, |
|
"grad_norm": 0.5447744727134705, |
|
"learning_rate": 1.2544655984286451e-05, |
|
"loss": 0.2641, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.339508251936679, |
|
"grad_norm": 0.5450101494789124, |
|
"learning_rate": 1.253158169792738e-05, |
|
"loss": 0.276, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.34018187942068034, |
|
"grad_norm": 0.6855320930480957, |
|
"learning_rate": 1.2518479547691437e-05, |
|
"loss": 0.3589, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.3408555069046817, |
|
"grad_norm": 0.52507483959198, |
|
"learning_rate": 1.250534960613567e-05, |
|
"loss": 0.2489, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.34152913438868304, |
|
"grad_norm": 0.5259436964988708, |
|
"learning_rate": 1.2492191945971028e-05, |
|
"loss": 0.2568, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.3422027618726844, |
|
"grad_norm": 0.5746189951896667, |
|
"learning_rate": 1.2479006640061958e-05, |
|
"loss": 0.2878, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.34287638935668574, |
|
"grad_norm": 0.5484218001365662, |
|
"learning_rate": 1.2465793761426005e-05, |
|
"loss": 0.3059, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.3435500168406871, |
|
"grad_norm": 0.5747763514518738, |
|
"learning_rate": 1.24525533832334e-05, |
|
"loss": 0.2505, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.34422364432468844, |
|
"grad_norm": 0.5692996382713318, |
|
"learning_rate": 1.2439285578806678e-05, |
|
"loss": 0.3077, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.3448972718086898, |
|
"grad_norm": 0.5282084345817566, |
|
"learning_rate": 1.2425990421620235e-05, |
|
"loss": 0.2763, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.34557089929269114, |
|
"grad_norm": 0.4825171232223511, |
|
"learning_rate": 1.241266798529995e-05, |
|
"loss": 0.2423, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.3462445267766925, |
|
"grad_norm": 0.5359032154083252, |
|
"learning_rate": 1.239931834362277e-05, |
|
"loss": 0.2796, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.34691815426069383, |
|
"grad_norm": 0.473827600479126, |
|
"learning_rate": 1.2385941570516297e-05, |
|
"loss": 0.2531, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.3475917817446952, |
|
"grad_norm": 0.4639384150505066, |
|
"learning_rate": 1.2372537740058382e-05, |
|
"loss": 0.2326, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.34826540922869653, |
|
"grad_norm": 0.5909863710403442, |
|
"learning_rate": 1.2359106926476714e-05, |
|
"loss": 0.2824, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.3489390367126979, |
|
"grad_norm": 0.5261175036430359, |
|
"learning_rate": 1.234564920414841e-05, |
|
"loss": 0.2757, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.34961266419669923, |
|
"grad_norm": 0.577748715877533, |
|
"learning_rate": 1.2332164647599599e-05, |
|
"loss": 0.2619, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.3502862916807006, |
|
"grad_norm": 0.5614107251167297, |
|
"learning_rate": 1.2318653331505015e-05, |
|
"loss": 0.2928, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.35095991916470193, |
|
"grad_norm": 0.5660324692726135, |
|
"learning_rate": 1.2305115330687585e-05, |
|
"loss": 0.2797, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.3516335466487033, |
|
"grad_norm": 0.5362821817398071, |
|
"learning_rate": 1.2291550720117997e-05, |
|
"loss": 0.2931, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.3523071741327046, |
|
"grad_norm": 0.5424318909645081, |
|
"learning_rate": 1.2277959574914317e-05, |
|
"loss": 0.2709, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.352980801616706, |
|
"grad_norm": 0.5283873081207275, |
|
"learning_rate": 1.226434197034154e-05, |
|
"loss": 0.2478, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.3536544291007073, |
|
"grad_norm": 0.5451403260231018, |
|
"learning_rate": 1.2250697981811195e-05, |
|
"loss": 0.2684, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.3543280565847087, |
|
"grad_norm": 0.5320309400558472, |
|
"learning_rate": 1.2237027684880914e-05, |
|
"loss": 0.2678, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.35500168406871, |
|
"grad_norm": 0.558335542678833, |
|
"learning_rate": 1.2223331155254026e-05, |
|
"loss": 0.2715, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.35567531155271137, |
|
"grad_norm": 0.5011473298072815, |
|
"learning_rate": 1.220960846877913e-05, |
|
"loss": 0.2535, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.3563489390367127, |
|
"grad_norm": 0.5432257056236267, |
|
"learning_rate": 1.2195859701449672e-05, |
|
"loss": 0.2802, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.35702256652071407, |
|
"grad_norm": 0.5836246013641357, |
|
"learning_rate": 1.2182084929403531e-05, |
|
"loss": 0.3088, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.3576961940047154, |
|
"grad_norm": 0.5858445167541504, |
|
"learning_rate": 1.2168284228922597e-05, |
|
"loss": 0.2751, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.35836982148871677, |
|
"grad_norm": 0.556725800037384, |
|
"learning_rate": 1.2154457676432344e-05, |
|
"loss": 0.2693, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.35904344897271806, |
|
"grad_norm": 0.5822067260742188, |
|
"learning_rate": 1.2140605348501409e-05, |
|
"loss": 0.3145, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.3597170764567194, |
|
"grad_norm": 0.5754439830780029, |
|
"learning_rate": 1.212672732184117e-05, |
|
"loss": 0.3009, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.36039070394072076, |
|
"grad_norm": 0.5826534032821655, |
|
"learning_rate": 1.2112823673305317e-05, |
|
"loss": 0.3112, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.3610643314247221, |
|
"grad_norm": 0.5259435176849365, |
|
"learning_rate": 1.209889447988943e-05, |
|
"loss": 0.2572, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.36173795890872346, |
|
"grad_norm": 0.5303089022636414, |
|
"learning_rate": 1.2084939818730554e-05, |
|
"loss": 0.2745, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.3624115863927248, |
|
"grad_norm": 0.4945959150791168, |
|
"learning_rate": 1.2070959767106762e-05, |
|
"loss": 0.2624, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.36308521387672615, |
|
"grad_norm": 0.5212944149971008, |
|
"learning_rate": 1.2056954402436743e-05, |
|
"loss": 0.2367, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.3637588413607275, |
|
"grad_norm": 0.5474100708961487, |
|
"learning_rate": 1.2042923802279356e-05, |
|
"loss": 0.2922, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.36443246884472885, |
|
"grad_norm": 0.5586138963699341, |
|
"learning_rate": 1.2028868044333218e-05, |
|
"loss": 0.2779, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.3651060963287302, |
|
"grad_norm": 0.4587612450122833, |
|
"learning_rate": 1.2014787206436256e-05, |
|
"loss": 0.2291, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.36577972381273155, |
|
"grad_norm": 0.5979660749435425, |
|
"learning_rate": 1.200068136656529e-05, |
|
"loss": 0.2663, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.3664533512967329, |
|
"grad_norm": 0.5004269480705261, |
|
"learning_rate": 1.1986550602835595e-05, |
|
"loss": 0.2325, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.36712697878073425, |
|
"grad_norm": 0.5056456327438354, |
|
"learning_rate": 1.1972394993500466e-05, |
|
"loss": 0.2691, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.3678006062647356, |
|
"grad_norm": 0.5447576642036438, |
|
"learning_rate": 1.1958214616950794e-05, |
|
"loss": 0.272, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.36847423374873695, |
|
"grad_norm": 0.5720804929733276, |
|
"learning_rate": 1.1944009551714623e-05, |
|
"loss": 0.2651, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.3691478612327383, |
|
"grad_norm": 0.5342965722084045, |
|
"learning_rate": 1.1929779876456713e-05, |
|
"loss": 0.2681, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.36982148871673964, |
|
"grad_norm": 0.5355931520462036, |
|
"learning_rate": 1.191552566997812e-05, |
|
"loss": 0.2504, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.370495116200741, |
|
"grad_norm": 0.6217589378356934, |
|
"learning_rate": 1.1901247011215733e-05, |
|
"loss": 0.2704, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.37116874368474234, |
|
"grad_norm": 0.6108464002609253, |
|
"learning_rate": 1.1886943979241874e-05, |
|
"loss": 0.2995, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.3718423711687437, |
|
"grad_norm": 0.5349010229110718, |
|
"learning_rate": 1.187261665326382e-05, |
|
"loss": 0.2873, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.37251599865274504, |
|
"grad_norm": 0.5306320786476135, |
|
"learning_rate": 1.1858265112623388e-05, |
|
"loss": 0.2546, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.3731896261367464, |
|
"grad_norm": 0.5984854102134705, |
|
"learning_rate": 1.18438894367965e-05, |
|
"loss": 0.3019, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.37386325362074774, |
|
"grad_norm": 0.5498750805854797, |
|
"learning_rate": 1.1829489705392727e-05, |
|
"loss": 0.2702, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.3745368811047491, |
|
"grad_norm": 0.5973288416862488, |
|
"learning_rate": 1.1815065998154849e-05, |
|
"loss": 0.2947, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.37521050858875044, |
|
"grad_norm": 0.5865532755851746, |
|
"learning_rate": 1.180061839495843e-05, |
|
"loss": 0.3207, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.3758841360727518, |
|
"grad_norm": 0.5075846314430237, |
|
"learning_rate": 1.1786146975811359e-05, |
|
"loss": 0.2474, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.37655776355675313, |
|
"grad_norm": 0.5501227378845215, |
|
"learning_rate": 1.1771651820853417e-05, |
|
"loss": 0.274, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.3772313910407545, |
|
"grad_norm": 0.5292581915855408, |
|
"learning_rate": 1.1757133010355821e-05, |
|
"loss": 0.2546, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.37790501852475583, |
|
"grad_norm": 0.5926501750946045, |
|
"learning_rate": 1.1742590624720796e-05, |
|
"loss": 0.2847, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.3785786460087572, |
|
"grad_norm": 0.5264430046081543, |
|
"learning_rate": 1.1728024744481117e-05, |
|
"loss": 0.2634, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.37925227349275853, |
|
"grad_norm": 0.5014563798904419, |
|
"learning_rate": 1.171343545029967e-05, |
|
"loss": 0.2301, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.3799259009767599, |
|
"grad_norm": 0.48584073781967163, |
|
"learning_rate": 1.1698822822969001e-05, |
|
"loss": 0.2482, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.38059952846076117, |
|
"grad_norm": 0.5884197354316711, |
|
"learning_rate": 1.1684186943410867e-05, |
|
"loss": 0.286, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.3812731559447625, |
|
"grad_norm": 0.556430459022522, |
|
"learning_rate": 1.16695278926758e-05, |
|
"loss": 0.2496, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.38194678342876387, |
|
"grad_norm": 0.5392268300056458, |
|
"learning_rate": 1.165484575194264e-05, |
|
"loss": 0.2786, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.3826204109127652, |
|
"grad_norm": 0.5491148233413696, |
|
"learning_rate": 1.1640140602518102e-05, |
|
"loss": 0.2289, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.38329403839676657, |
|
"grad_norm": 0.5565954446792603, |
|
"learning_rate": 1.162541252583631e-05, |
|
"loss": 0.2614, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.3839676658807679, |
|
"grad_norm": 0.5307971239089966, |
|
"learning_rate": 1.1610661603458363e-05, |
|
"loss": 0.2577, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.38464129336476927, |
|
"grad_norm": 0.5446802377700806, |
|
"learning_rate": 1.159588791707187e-05, |
|
"loss": 0.292, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.3853149208487706, |
|
"grad_norm": 0.5837084054946899, |
|
"learning_rate": 1.1581091548490505e-05, |
|
"loss": 0.2771, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.38598854833277196, |
|
"grad_norm": 0.5611515045166016, |
|
"learning_rate": 1.156627257965355e-05, |
|
"loss": 0.2602, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.3866621758167733, |
|
"grad_norm": 0.5338358879089355, |
|
"learning_rate": 1.155143109262544e-05, |
|
"loss": 0.2573, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.38733580330077466, |
|
"grad_norm": 0.4791894853115082, |
|
"learning_rate": 1.1536567169595316e-05, |
|
"loss": 0.2411, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.388009430784776, |
|
"grad_norm": 0.5701311826705933, |
|
"learning_rate": 1.1521680892876563e-05, |
|
"loss": 0.2973, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.38868305826877736, |
|
"grad_norm": 0.4976153075695038, |
|
"learning_rate": 1.1506772344906356e-05, |
|
"loss": 0.2716, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.3893566857527787, |
|
"grad_norm": 0.5492983460426331, |
|
"learning_rate": 1.1491841608245204e-05, |
|
"loss": 0.2621, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.39003031323678006, |
|
"grad_norm": 0.5490813255310059, |
|
"learning_rate": 1.1476888765576493e-05, |
|
"loss": 0.2687, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.3907039407207814, |
|
"grad_norm": 0.5402075052261353, |
|
"learning_rate": 1.1461913899706025e-05, |
|
"loss": 0.3112, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.39137756820478276, |
|
"grad_norm": 0.5017600059509277, |
|
"learning_rate": 1.1446917093561564e-05, |
|
"loss": 0.2242, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.3920511956887841, |
|
"grad_norm": 0.5590758919715881, |
|
"learning_rate": 1.1431898430192375e-05, |
|
"loss": 0.2569, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.39272482317278545, |
|
"grad_norm": 0.5497624278068542, |
|
"learning_rate": 1.1416857992768764e-05, |
|
"loss": 0.3114, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.3933984506567868, |
|
"grad_norm": 0.5833696126937866, |
|
"learning_rate": 1.1401795864581616e-05, |
|
"loss": 0.2999, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.39407207814078815, |
|
"grad_norm": 0.5114924907684326, |
|
"learning_rate": 1.1386712129041937e-05, |
|
"loss": 0.2428, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.3947457056247895, |
|
"grad_norm": 0.5477609038352966, |
|
"learning_rate": 1.1371606869680388e-05, |
|
"loss": 0.2722, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.39541933310879085, |
|
"grad_norm": 0.5121515393257141, |
|
"learning_rate": 1.1356480170146826e-05, |
|
"loss": 0.2376, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.3960929605927922, |
|
"grad_norm": 0.502560019493103, |
|
"learning_rate": 1.1341332114209838e-05, |
|
"loss": 0.2737, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.39676658807679355, |
|
"grad_norm": 0.5239719748497009, |
|
"learning_rate": 1.1326162785756281e-05, |
|
"loss": 0.2563, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.3974402155607949, |
|
"grad_norm": 0.5645294189453125, |
|
"learning_rate": 1.131097226879081e-05, |
|
"loss": 0.308, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.39811384304479625, |
|
"grad_norm": 0.5425258278846741, |
|
"learning_rate": 1.1295760647435424e-05, |
|
"loss": 0.2388, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.3987874705287976, |
|
"grad_norm": 0.5374796390533447, |
|
"learning_rate": 1.1280528005928988e-05, |
|
"loss": 0.2774, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.39946109801279894, |
|
"grad_norm": 0.5628758072853088, |
|
"learning_rate": 1.1265274428626775e-05, |
|
"loss": 0.2689, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.4001347254968003, |
|
"grad_norm": 0.5226148366928101, |
|
"learning_rate": 1.125e-05, |
|
"loss": 0.2713, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.40080835298080164, |
|
"grad_norm": 0.5630069971084595, |
|
"learning_rate": 1.1234704804635342e-05, |
|
"loss": 0.3279, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.401481980464803, |
|
"grad_norm": 0.508704423904419, |
|
"learning_rate": 1.1219388927234482e-05, |
|
"loss": 0.2623, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.40215560794880434, |
|
"grad_norm": 0.5345742702484131, |
|
"learning_rate": 1.1204052452613638e-05, |
|
"loss": 0.2865, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.40282923543280563, |
|
"grad_norm": 0.5258358120918274, |
|
"learning_rate": 1.1188695465703092e-05, |
|
"loss": 0.2721, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.403502862916807, |
|
"grad_norm": 0.5306556820869446, |
|
"learning_rate": 1.1173318051546713e-05, |
|
"loss": 0.2753, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.40417649040080833, |
|
"grad_norm": 0.49859175086021423, |
|
"learning_rate": 1.1157920295301498e-05, |
|
"loss": 0.2594, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.40417649040080833, |
|
"eval_loss": 0.2652011811733246, |
|
"eval_runtime": 105.8884, |
|
"eval_samples_per_second": 47.22, |
|
"eval_steps_per_second": 2.956, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.4048501178848097, |
|
"grad_norm": 0.558407723903656, |
|
"learning_rate": 1.114250228223709e-05, |
|
"loss": 0.256, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.40552374536881103, |
|
"grad_norm": 0.508040726184845, |
|
"learning_rate": 1.1127064097735315e-05, |
|
"loss": 0.2575, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.4061973728528124, |
|
"grad_norm": 0.5474634766578674, |
|
"learning_rate": 1.1111605827289698e-05, |
|
"loss": 0.2805, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.4068710003368137, |
|
"grad_norm": 0.519263505935669, |
|
"learning_rate": 1.1096127556505e-05, |
|
"loss": 0.2534, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.4075446278208151, |
|
"grad_norm": 0.5802994966506958, |
|
"learning_rate": 1.1080629371096738e-05, |
|
"loss": 0.2756, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.4082182553048164, |
|
"grad_norm": 0.5730322599411011, |
|
"learning_rate": 1.1065111356890712e-05, |
|
"loss": 0.2888, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.4088918827888178, |
|
"grad_norm": 0.5447918176651001, |
|
"learning_rate": 1.1049573599822537e-05, |
|
"loss": 0.2848, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.4095655102728191, |
|
"grad_norm": 0.5072281360626221, |
|
"learning_rate": 1.1034016185937149e-05, |
|
"loss": 0.2972, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.41023913775682047, |
|
"grad_norm": 0.6098499298095703, |
|
"learning_rate": 1.1018439201388346e-05, |
|
"loss": 0.299, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.4109127652408218, |
|
"grad_norm": 0.594445526599884, |
|
"learning_rate": 1.1002842732438301e-05, |
|
"loss": 0.2778, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.41158639272482317, |
|
"grad_norm": 0.5406931638717651, |
|
"learning_rate": 1.0987226865457091e-05, |
|
"loss": 0.2948, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.4122600202088245, |
|
"grad_norm": 0.5487210750579834, |
|
"learning_rate": 1.0971591686922211e-05, |
|
"loss": 0.256, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.41293364769282587, |
|
"grad_norm": 0.5063245296478271, |
|
"learning_rate": 1.0955937283418104e-05, |
|
"loss": 0.2481, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.4136072751768272, |
|
"grad_norm": 0.5232447981834412, |
|
"learning_rate": 1.0940263741635678e-05, |
|
"loss": 0.2436, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.41428090266082856, |
|
"grad_norm": 0.5449836254119873, |
|
"learning_rate": 1.092457114837182e-05, |
|
"loss": 0.2621, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.4149545301448299, |
|
"grad_norm": 0.5582854151725769, |
|
"learning_rate": 1.090885959052892e-05, |
|
"loss": 0.2885, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.41562815762883126, |
|
"grad_norm": 0.5433541536331177, |
|
"learning_rate": 1.0893129155114396e-05, |
|
"loss": 0.2659, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.4163017851128326, |
|
"grad_norm": 0.5937801599502563, |
|
"learning_rate": 1.0877379929240198e-05, |
|
"loss": 0.2968, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.41697541259683396, |
|
"grad_norm": 0.4904331564903259, |
|
"learning_rate": 1.0861612000122341e-05, |
|
"loss": 0.2508, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.4176490400808353, |
|
"grad_norm": 0.5370484590530396, |
|
"learning_rate": 1.0845825455080411e-05, |
|
"loss": 0.2564, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.41832266756483666, |
|
"grad_norm": 0.535376250743866, |
|
"learning_rate": 1.0830020381537088e-05, |
|
"loss": 0.2796, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.418996295048838, |
|
"grad_norm": 0.5508119463920593, |
|
"learning_rate": 1.0814196867017656e-05, |
|
"loss": 0.281, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.41966992253283936, |
|
"grad_norm": 0.525283694267273, |
|
"learning_rate": 1.079835499914952e-05, |
|
"loss": 0.2306, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.4203435500168407, |
|
"grad_norm": 0.5157189965248108, |
|
"learning_rate": 1.078249486566173e-05, |
|
"loss": 0.2679, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.42101717750084205, |
|
"grad_norm": 0.6008614301681519, |
|
"learning_rate": 1.0766616554384477e-05, |
|
"loss": 0.2815, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.4216908049848434, |
|
"grad_norm": 0.5147749185562134, |
|
"learning_rate": 1.0750720153248626e-05, |
|
"loss": 0.2587, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.42236443246884475, |
|
"grad_norm": 0.5508129596710205, |
|
"learning_rate": 1.073480575028521e-05, |
|
"loss": 0.2788, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.4230380599528461, |
|
"grad_norm": 0.5465036034584045, |
|
"learning_rate": 1.0718873433624966e-05, |
|
"loss": 0.2606, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.42371168743684745, |
|
"grad_norm": 0.5761625170707703, |
|
"learning_rate": 1.070292329149782e-05, |
|
"loss": 0.3149, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.42438531492084874, |
|
"grad_norm": 0.5194136500358582, |
|
"learning_rate": 1.0686955412232419e-05, |
|
"loss": 0.2305, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.4250589424048501, |
|
"grad_norm": 0.5823161602020264, |
|
"learning_rate": 1.0670969884255636e-05, |
|
"loss": 0.2495, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.42573256988885144, |
|
"grad_norm": 0.5550847053527832, |
|
"learning_rate": 1.0654966796092073e-05, |
|
"loss": 0.2539, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.4264061973728528, |
|
"grad_norm": 0.5327949523925781, |
|
"learning_rate": 1.0638946236363578e-05, |
|
"loss": 0.2655, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.42707982485685414, |
|
"grad_norm": 0.5146956443786621, |
|
"learning_rate": 1.0622908293788758e-05, |
|
"loss": 0.2599, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.4277534523408555, |
|
"grad_norm": 0.5790160894393921, |
|
"learning_rate": 1.0606853057182481e-05, |
|
"loss": 0.298, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.42842707982485684, |
|
"grad_norm": 0.5627730488777161, |
|
"learning_rate": 1.059078061545538e-05, |
|
"loss": 0.2622, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.4291007073088582, |
|
"grad_norm": 0.619365394115448, |
|
"learning_rate": 1.0574691057613376e-05, |
|
"loss": 0.2905, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.42977433479285954, |
|
"grad_norm": 0.5521032810211182, |
|
"learning_rate": 1.0558584472757167e-05, |
|
"loss": 0.2705, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.4304479622768609, |
|
"grad_norm": 0.5045711398124695, |
|
"learning_rate": 1.0542460950081747e-05, |
|
"loss": 0.2289, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.43112158976086223, |
|
"grad_norm": 0.5129411816596985, |
|
"learning_rate": 1.0526320578875909e-05, |
|
"loss": 0.2572, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.4317952172448636, |
|
"grad_norm": 0.5294272899627686, |
|
"learning_rate": 1.0510163448521747e-05, |
|
"loss": 0.2702, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 0.43246884472886493, |
|
"grad_norm": 0.5448393225669861, |
|
"learning_rate": 1.0493989648494165e-05, |
|
"loss": 0.2808, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.4331424722128663, |
|
"grad_norm": 0.5107436776161194, |
|
"learning_rate": 1.0477799268360384e-05, |
|
"loss": 0.248, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 0.43381609969686763, |
|
"grad_norm": 0.5598347187042236, |
|
"learning_rate": 1.0461592397779435e-05, |
|
"loss": 0.2342, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.434489727180869, |
|
"grad_norm": 0.5707139372825623, |
|
"learning_rate": 1.0445369126501676e-05, |
|
"loss": 0.2764, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.4351633546648703, |
|
"grad_norm": 0.48345211148262024, |
|
"learning_rate": 1.0429129544368283e-05, |
|
"loss": 0.2215, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.4358369821488717, |
|
"grad_norm": 0.5131022930145264, |
|
"learning_rate": 1.0412873741310763e-05, |
|
"loss": 0.2423, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 0.436510609632873, |
|
"grad_norm": 0.5428949594497681, |
|
"learning_rate": 1.0396601807350452e-05, |
|
"loss": 0.2331, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.4371842371168744, |
|
"grad_norm": 0.47753867506980896, |
|
"learning_rate": 1.038031383259801e-05, |
|
"loss": 0.2552, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 0.4378578646008757, |
|
"grad_norm": 0.48779332637786865, |
|
"learning_rate": 1.0364009907252937e-05, |
|
"loss": 0.2499, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.4385314920848771, |
|
"grad_norm": 0.4910006523132324, |
|
"learning_rate": 1.0347690121603047e-05, |
|
"loss": 0.2498, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.4392051195688784, |
|
"grad_norm": 0.5575456023216248, |
|
"learning_rate": 1.0331354566024005e-05, |
|
"loss": 0.2503, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.43987874705287977, |
|
"grad_norm": 0.5806515216827393, |
|
"learning_rate": 1.0315003330978799e-05, |
|
"loss": 0.254, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 0.4405523745368811, |
|
"grad_norm": 0.5564923882484436, |
|
"learning_rate": 1.0298636507017241e-05, |
|
"loss": 0.2804, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.44122600202088247, |
|
"grad_norm": 0.5716164708137512, |
|
"learning_rate": 1.0282254184775473e-05, |
|
"loss": 0.2844, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.4418996295048838, |
|
"grad_norm": 0.5606719255447388, |
|
"learning_rate": 1.0265856454975473e-05, |
|
"loss": 0.2576, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.44257325698888517, |
|
"grad_norm": 0.5467285513877869, |
|
"learning_rate": 1.0249443408424535e-05, |
|
"loss": 0.2782, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.4432468844728865, |
|
"grad_norm": 0.569665253162384, |
|
"learning_rate": 1.0233015136014773e-05, |
|
"loss": 0.272, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.44392051195688786, |
|
"grad_norm": 0.5965842604637146, |
|
"learning_rate": 1.021657172872262e-05, |
|
"loss": 0.3023, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 0.4445941394408892, |
|
"grad_norm": 0.5759636163711548, |
|
"learning_rate": 1.0200113277608326e-05, |
|
"loss": 0.2621, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.44526776692489056, |
|
"grad_norm": 0.5999960899353027, |
|
"learning_rate": 1.0183639873815448e-05, |
|
"loss": 0.2976, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 0.44594139440889186, |
|
"grad_norm": 0.5440315008163452, |
|
"learning_rate": 1.0167151608570346e-05, |
|
"loss": 0.2889, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.4466150218928932, |
|
"grad_norm": 0.4932374358177185, |
|
"learning_rate": 1.0150648573181685e-05, |
|
"loss": 0.2271, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.44728864937689455, |
|
"grad_norm": 0.5871284604072571, |
|
"learning_rate": 1.0134130859039921e-05, |
|
"loss": 0.3202, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.4479622768608959, |
|
"grad_norm": 0.5287674069404602, |
|
"learning_rate": 1.0117598557616796e-05, |
|
"loss": 0.2486, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.44863590434489725, |
|
"grad_norm": 0.588444709777832, |
|
"learning_rate": 1.0101051760464837e-05, |
|
"loss": 0.2555, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.4493095318288986, |
|
"grad_norm": 0.5376453399658203, |
|
"learning_rate": 1.0084490559216843e-05, |
|
"loss": 0.2506, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.44998315931289995, |
|
"grad_norm": 0.5496957898139954, |
|
"learning_rate": 1.006791504558538e-05, |
|
"loss": 0.2616, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.4506567867969013, |
|
"grad_norm": 0.523008406162262, |
|
"learning_rate": 1.0051325311362278e-05, |
|
"loss": 0.2597, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.45133041428090265, |
|
"grad_norm": 0.5686816573143005, |
|
"learning_rate": 1.0034721448418105e-05, |
|
"loss": 0.2665, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.452004041764904, |
|
"grad_norm": 0.5065593719482422, |
|
"learning_rate": 1.0018103548701688e-05, |
|
"loss": 0.2566, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 0.45267766924890535, |
|
"grad_norm": 0.5687103867530823, |
|
"learning_rate": 1.0001471704239577e-05, |
|
"loss": 0.2628, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.4533512967329067, |
|
"grad_norm": 0.5782075524330139, |
|
"learning_rate": 9.984826007135544e-06, |
|
"loss": 0.2732, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 0.45402492421690804, |
|
"grad_norm": 0.5679803490638733, |
|
"learning_rate": 9.968166549570075e-06, |
|
"loss": 0.2664, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.4546985517009094, |
|
"grad_norm": 0.5293748378753662, |
|
"learning_rate": 9.951493423799866e-06, |
|
"loss": 0.2498, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.45537217918491074, |
|
"grad_norm": 0.5444015264511108, |
|
"learning_rate": 9.934806722157294e-06, |
|
"loss": 0.2549, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.4560458066689121, |
|
"grad_norm": 0.5367648601531982, |
|
"learning_rate": 9.918106537049921e-06, |
|
"loss": 0.2623, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 0.45671943415291344, |
|
"grad_norm": 0.5820662975311279, |
|
"learning_rate": 9.901392960959983e-06, |
|
"loss": 0.2771, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.4573930616369148, |
|
"grad_norm": 0.5573861598968506, |
|
"learning_rate": 9.884666086443862e-06, |
|
"loss": 0.2614, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.45806668912091614, |
|
"grad_norm": 0.6296043992042542, |
|
"learning_rate": 9.867926006131597e-06, |
|
"loss": 0.3102, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.4587403166049175, |
|
"grad_norm": 0.5795363187789917, |
|
"learning_rate": 9.851172812726344e-06, |
|
"loss": 0.3059, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 0.45941394408891884, |
|
"grad_norm": 0.48046785593032837, |
|
"learning_rate": 9.834406599003885e-06, |
|
"loss": 0.2323, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.4600875715729202, |
|
"grad_norm": 0.4878872036933899, |
|
"learning_rate": 9.817627457812105e-06, |
|
"loss": 0.2467, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 0.46076119905692153, |
|
"grad_norm": 0.5333375334739685, |
|
"learning_rate": 9.800835482070479e-06, |
|
"loss": 0.2282, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.4614348265409229, |
|
"grad_norm": 0.543725848197937, |
|
"learning_rate": 9.784030764769553e-06, |
|
"loss": 0.2427, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.46210845402492423, |
|
"grad_norm": 0.5145445466041565, |
|
"learning_rate": 9.76721339897044e-06, |
|
"loss": 0.2291, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.4627820815089256, |
|
"grad_norm": 0.5099066495895386, |
|
"learning_rate": 9.75038347780429e-06, |
|
"loss": 0.245, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 0.46345570899292693, |
|
"grad_norm": 0.5599386096000671, |
|
"learning_rate": 9.73354109447179e-06, |
|
"loss": 0.2994, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.4641293364769283, |
|
"grad_norm": 0.5298258662223816, |
|
"learning_rate": 9.716686342242632e-06, |
|
"loss": 0.231, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 0.4648029639609296, |
|
"grad_norm": 0.5349884033203125, |
|
"learning_rate": 9.69981931445501e-06, |
|
"loss": 0.2436, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.465476591444931, |
|
"grad_norm": 0.5078858137130737, |
|
"learning_rate": 9.682940104515097e-06, |
|
"loss": 0.2735, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 0.4661502189289323, |
|
"grad_norm": 0.5433405637741089, |
|
"learning_rate": 9.666048805896524e-06, |
|
"loss": 0.2472, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.4668238464129337, |
|
"grad_norm": 0.5337989926338196, |
|
"learning_rate": 9.649145512139876e-06, |
|
"loss": 0.2815, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.46749747389693497, |
|
"grad_norm": 0.491817831993103, |
|
"learning_rate": 9.632230316852153e-06, |
|
"loss": 0.2712, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.4681711013809363, |
|
"grad_norm": 0.5814330577850342, |
|
"learning_rate": 9.615303313706271e-06, |
|
"loss": 0.2931, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.46884472886493767, |
|
"grad_norm": 0.5358330607414246, |
|
"learning_rate": 9.598364596440534e-06, |
|
"loss": 0.2546, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.469518356348939, |
|
"grad_norm": 0.5111145377159119, |
|
"learning_rate": 9.581414258858116e-06, |
|
"loss": 0.2607, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 0.47019198383294036, |
|
"grad_norm": 0.5266521573066711, |
|
"learning_rate": 9.564452394826538e-06, |
|
"loss": 0.2554, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.4708656113169417, |
|
"grad_norm": 0.5091780424118042, |
|
"learning_rate": 9.54747909827716e-06, |
|
"loss": 0.2723, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 0.47153923880094306, |
|
"grad_norm": 0.5414915680885315, |
|
"learning_rate": 9.530494463204646e-06, |
|
"loss": 0.2577, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.47153923880094306, |
|
"eval_loss": 0.26179420948028564, |
|
"eval_runtime": 105.0708, |
|
"eval_samples_per_second": 47.587, |
|
"eval_steps_per_second": 2.979, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.4722128662849444, |
|
"grad_norm": 0.505789577960968, |
|
"learning_rate": 9.513498583666456e-06, |
|
"loss": 0.2448, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 0.47288649376894576, |
|
"grad_norm": 0.46454617381095886, |
|
"learning_rate": 9.496491553782314e-06, |
|
"loss": 0.221, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.4735601212529471, |
|
"grad_norm": 0.5358849763870239, |
|
"learning_rate": 9.479473467733697e-06, |
|
"loss": 0.2872, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 0.47423374873694846, |
|
"grad_norm": 0.5496987700462341, |
|
"learning_rate": 9.462444419763306e-06, |
|
"loss": 0.2464, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.4749073762209498, |
|
"grad_norm": 0.5485591292381287, |
|
"learning_rate": 9.445404504174546e-06, |
|
"loss": 0.2695, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.47558100370495116, |
|
"grad_norm": 0.5437228679656982, |
|
"learning_rate": 9.42835381533101e-06, |
|
"loss": 0.2823, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.4762546311889525, |
|
"grad_norm": 0.5094515085220337, |
|
"learning_rate": 9.411292447655948e-06, |
|
"loss": 0.2401, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 0.47692825867295385, |
|
"grad_norm": 0.5395442843437195, |
|
"learning_rate": 9.394220495631744e-06, |
|
"loss": 0.2659, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.4776018861569552, |
|
"grad_norm": 0.4930800795555115, |
|
"learning_rate": 9.377138053799399e-06, |
|
"loss": 0.2383, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 0.47827551364095655, |
|
"grad_norm": 0.5237337350845337, |
|
"learning_rate": 9.360045216758008e-06, |
|
"loss": 0.2527, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.4789491411249579, |
|
"grad_norm": 0.5243161916732788, |
|
"learning_rate": 9.342942079164223e-06, |
|
"loss": 0.2515, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 0.47962276860895925, |
|
"grad_norm": 0.5414012670516968, |
|
"learning_rate": 9.325828735731747e-06, |
|
"loss": 0.275, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.4802963960929606, |
|
"grad_norm": 0.547073245048523, |
|
"learning_rate": 9.308705281230796e-06, |
|
"loss": 0.276, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 0.48097002357696195, |
|
"grad_norm": 0.49008458852767944, |
|
"learning_rate": 9.291571810487584e-06, |
|
"loss": 0.246, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.4816436510609633, |
|
"grad_norm": 0.5415433645248413, |
|
"learning_rate": 9.27442841838379e-06, |
|
"loss": 0.2658, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.48231727854496464, |
|
"grad_norm": 0.5856931209564209, |
|
"learning_rate": 9.257275199856032e-06, |
|
"loss": 0.2675, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.482990906028966, |
|
"grad_norm": 0.5154370665550232, |
|
"learning_rate": 9.24011224989535e-06, |
|
"loss": 0.2422, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 0.48366453351296734, |
|
"grad_norm": 0.5306107401847839, |
|
"learning_rate": 9.222939663546677e-06, |
|
"loss": 0.2687, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.4843381609969687, |
|
"grad_norm": 0.4880635142326355, |
|
"learning_rate": 9.2057575359083e-06, |
|
"loss": 0.2276, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 0.48501178848097004, |
|
"grad_norm": 0.6055603623390198, |
|
"learning_rate": 9.18856596213135e-06, |
|
"loss": 0.2907, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.4856854159649714, |
|
"grad_norm": 0.5602757930755615, |
|
"learning_rate": 9.171365037419272e-06, |
|
"loss": 0.2511, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 0.48635904344897274, |
|
"grad_norm": 0.5492405295372009, |
|
"learning_rate": 9.15415485702729e-06, |
|
"loss": 0.246, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.4870326709329741, |
|
"grad_norm": 0.6091371178627014, |
|
"learning_rate": 9.136935516261887e-06, |
|
"loss": 0.3003, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 0.48770629841697544, |
|
"grad_norm": 0.5400590300559998, |
|
"learning_rate": 9.119707110480272e-06, |
|
"loss": 0.2576, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.4883799259009768, |
|
"grad_norm": 0.5183984041213989, |
|
"learning_rate": 9.10246973508985e-06, |
|
"loss": 0.2519, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.48905355338497813, |
|
"grad_norm": 0.5791885256767273, |
|
"learning_rate": 9.08522348554771e-06, |
|
"loss": 0.269, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.48972718086897943, |
|
"grad_norm": 0.5196906328201294, |
|
"learning_rate": 9.067968457360073e-06, |
|
"loss": 0.2681, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 0.4904008083529808, |
|
"grad_norm": 0.5393977165222168, |
|
"learning_rate": 9.050704746081779e-06, |
|
"loss": 0.2487, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.4910744358369821, |
|
"grad_norm": 0.5441868305206299, |
|
"learning_rate": 9.033432447315751e-06, |
|
"loss": 0.2603, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 0.4917480633209835, |
|
"grad_norm": 0.4999203383922577, |
|
"learning_rate": 9.016151656712473e-06, |
|
"loss": 0.2569, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.4924216908049848, |
|
"grad_norm": 0.5059922933578491, |
|
"learning_rate": 8.998862469969452e-06, |
|
"loss": 0.2428, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 0.4930953182889862, |
|
"grad_norm": 0.5794141292572021, |
|
"learning_rate": 8.981564982830683e-06, |
|
"loss": 0.2901, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.4937689457729875, |
|
"grad_norm": 0.5344904065132141, |
|
"learning_rate": 8.964259291086141e-06, |
|
"loss": 0.278, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 0.49444257325698887, |
|
"grad_norm": 0.5577378273010254, |
|
"learning_rate": 8.946945490571227e-06, |
|
"loss": 0.2753, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.4951162007409902, |
|
"grad_norm": 0.48888590931892395, |
|
"learning_rate": 8.92962367716625e-06, |
|
"loss": 0.2565, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.49578982822499157, |
|
"grad_norm": 0.5605798363685608, |
|
"learning_rate": 8.912293946795895e-06, |
|
"loss": 0.274, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.4964634557089929, |
|
"grad_norm": 0.5351974964141846, |
|
"learning_rate": 8.894956395428685e-06, |
|
"loss": 0.259, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 0.49713708319299427, |
|
"grad_norm": 0.530037522315979, |
|
"learning_rate": 8.877611119076454e-06, |
|
"loss": 0.2468, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.4978107106769956, |
|
"grad_norm": 0.5955355763435364, |
|
"learning_rate": 8.860258213793819e-06, |
|
"loss": 0.2702, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 0.49848433816099696, |
|
"grad_norm": 0.5594556927680969, |
|
"learning_rate": 8.842897775677645e-06, |
|
"loss": 0.2796, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.4991579656449983, |
|
"grad_norm": 0.5318235158920288, |
|
"learning_rate": 8.825529900866507e-06, |
|
"loss": 0.2721, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 0.49983159312899966, |
|
"grad_norm": 0.6066297888755798, |
|
"learning_rate": 8.808154685540164e-06, |
|
"loss": 0.2814, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.500505220613001, |
|
"grad_norm": 0.520949125289917, |
|
"learning_rate": 8.790772225919031e-06, |
|
"loss": 0.2479, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 0.5011788480970023, |
|
"grad_norm": 0.532832682132721, |
|
"learning_rate": 8.77338261826364e-06, |
|
"loss": 0.2717, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.5018524755810037, |
|
"grad_norm": 0.4917290210723877, |
|
"learning_rate": 8.755985958874096e-06, |
|
"loss": 0.2331, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.502526103065005, |
|
"grad_norm": 0.6336959004402161, |
|
"learning_rate": 8.73858234408957e-06, |
|
"loss": 0.3059, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.5031997305490064, |
|
"grad_norm": 0.5722649693489075, |
|
"learning_rate": 8.72117187028774e-06, |
|
"loss": 0.2682, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 0.5038733580330077, |
|
"grad_norm": 0.47712576389312744, |
|
"learning_rate": 8.70375463388427e-06, |
|
"loss": 0.2468, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.504546985517009, |
|
"grad_norm": 0.49866771697998047, |
|
"learning_rate": 8.68633073133228e-06, |
|
"loss": 0.2609, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.5052206130010104, |
|
"grad_norm": 0.5410306453704834, |
|
"learning_rate": 8.6689002591218e-06, |
|
"loss": 0.2733, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.5058942404850117, |
|
"grad_norm": 0.5518447160720825, |
|
"learning_rate": 8.651463313779241e-06, |
|
"loss": 0.2525, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 0.5065678679690131, |
|
"grad_norm": 0.5311466455459595, |
|
"learning_rate": 8.634019991866863e-06, |
|
"loss": 0.275, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.5072414954530144, |
|
"grad_norm": 0.5381631255149841, |
|
"learning_rate": 8.61657038998224e-06, |
|
"loss": 0.275, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 0.5079151229370158, |
|
"grad_norm": 0.48526835441589355, |
|
"learning_rate": 8.599114604757716e-06, |
|
"loss": 0.2431, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.5085887504210171, |
|
"grad_norm": 0.5347431302070618, |
|
"learning_rate": 8.581652732859887e-06, |
|
"loss": 0.2731, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.5092623779050185, |
|
"grad_norm": 0.5098583102226257, |
|
"learning_rate": 8.56418487098905e-06, |
|
"loss": 0.294, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.5099360053890198, |
|
"grad_norm": 0.499496191740036, |
|
"learning_rate": 8.54671111587867e-06, |
|
"loss": 0.2294, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 0.5106096328730212, |
|
"grad_norm": 0.5586072206497192, |
|
"learning_rate": 8.529231564294858e-06, |
|
"loss": 0.2506, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.5112832603570225, |
|
"grad_norm": 0.5203363299369812, |
|
"learning_rate": 8.51174631303581e-06, |
|
"loss": 0.2505, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.5119568878410239, |
|
"grad_norm": 0.5142697095870972, |
|
"learning_rate": 8.494255458931304e-06, |
|
"loss": 0.2456, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.5126305153250252, |
|
"grad_norm": 0.4652908444404602, |
|
"learning_rate": 8.476759098842129e-06, |
|
"loss": 0.2085, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 0.5133041428090266, |
|
"grad_norm": 0.5014703273773193, |
|
"learning_rate": 8.459257329659571e-06, |
|
"loss": 0.239, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.5139777702930279, |
|
"grad_norm": 0.5147262215614319, |
|
"learning_rate": 8.441750248304872e-06, |
|
"loss": 0.2727, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.5146513977770293, |
|
"grad_norm": 0.564335823059082, |
|
"learning_rate": 8.424237951728689e-06, |
|
"loss": 0.2983, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.5153250252610306, |
|
"grad_norm": 0.5217107534408569, |
|
"learning_rate": 8.406720536910568e-06, |
|
"loss": 0.238, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.515998652745032, |
|
"grad_norm": 0.529780924320221, |
|
"learning_rate": 8.389198100858385e-06, |
|
"loss": 0.271, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.5166722802290333, |
|
"grad_norm": 0.5005664229393005, |
|
"learning_rate": 8.371670740607833e-06, |
|
"loss": 0.265, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 0.5173459077130347, |
|
"grad_norm": 0.4695169925689697, |
|
"learning_rate": 8.354138553221869e-06, |
|
"loss": 0.225, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.518019535197036, |
|
"grad_norm": 0.6260945200920105, |
|
"learning_rate": 8.336601635790184e-06, |
|
"loss": 0.2725, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 0.5186931626810374, |
|
"grad_norm": 0.5363501310348511, |
|
"learning_rate": 8.319060085428664e-06, |
|
"loss": 0.2631, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.5193667901650387, |
|
"grad_norm": 0.5340143442153931, |
|
"learning_rate": 8.301513999278851e-06, |
|
"loss": 0.2829, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.5200404176490401, |
|
"grad_norm": 0.5355620384216309, |
|
"learning_rate": 8.283963474507402e-06, |
|
"loss": 0.2675, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.5207140451330414, |
|
"grad_norm": 0.5030906796455383, |
|
"learning_rate": 8.266408608305555e-06, |
|
"loss": 0.2243, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 0.5213876726170428, |
|
"grad_norm": 0.5517938137054443, |
|
"learning_rate": 8.248849497888598e-06, |
|
"loss": 0.2554, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.5220613001010441, |
|
"grad_norm": 0.47788354754447937, |
|
"learning_rate": 8.231286240495305e-06, |
|
"loss": 0.2258, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.5227349275850455, |
|
"grad_norm": 0.550268828868866, |
|
"learning_rate": 8.213718933387438e-06, |
|
"loss": 0.2586, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.5234085550690468, |
|
"grad_norm": 0.5247451066970825, |
|
"learning_rate": 8.196147673849165e-06, |
|
"loss": 0.2491, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.5240821825530482, |
|
"grad_norm": 0.49666067957878113, |
|
"learning_rate": 8.17857255918655e-06, |
|
"loss": 0.2501, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.5247558100370495, |
|
"grad_norm": 0.5575336217880249, |
|
"learning_rate": 8.160993686727015e-06, |
|
"loss": 0.3047, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 0.5254294375210509, |
|
"grad_norm": 0.5327598452568054, |
|
"learning_rate": 8.143411153818773e-06, |
|
"loss": 0.289, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.5261030650050522, |
|
"grad_norm": 0.4978947043418884, |
|
"learning_rate": 8.125825057830323e-06, |
|
"loss": 0.2817, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 0.5267766924890536, |
|
"grad_norm": 0.5068449378013611, |
|
"learning_rate": 8.108235496149892e-06, |
|
"loss": 0.2549, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.5274503199730549, |
|
"grad_norm": 0.5815426111221313, |
|
"learning_rate": 8.090642566184896e-06, |
|
"loss": 0.3215, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.5281239474570563, |
|
"grad_norm": 0.528716504573822, |
|
"learning_rate": 8.073046365361404e-06, |
|
"loss": 0.2405, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.5287975749410576, |
|
"grad_norm": 0.5129048824310303, |
|
"learning_rate": 8.0554469911236e-06, |
|
"loss": 0.2696, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.529471202425059, |
|
"grad_norm": 0.5234351754188538, |
|
"learning_rate": 8.037844540933245e-06, |
|
"loss": 0.2608, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.5301448299090603, |
|
"grad_norm": 0.531194269657135, |
|
"learning_rate": 8.020239112269131e-06, |
|
"loss": 0.2826, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 0.5308184573930617, |
|
"grad_norm": 0.5546161532402039, |
|
"learning_rate": 8.002630802626538e-06, |
|
"loss": 0.2635, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.531492084877063, |
|
"grad_norm": 0.5576707124710083, |
|
"learning_rate": 7.985019709516714e-06, |
|
"loss": 0.2591, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.5321657123610644, |
|
"grad_norm": 0.5075989961624146, |
|
"learning_rate": 7.967405930466305e-06, |
|
"loss": 0.2751, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.5328393398450657, |
|
"grad_norm": 0.547538161277771, |
|
"learning_rate": 7.94978956301685e-06, |
|
"loss": 0.2767, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.5335129673290671, |
|
"grad_norm": 0.6105408072471619, |
|
"learning_rate": 7.932170704724202e-06, |
|
"loss": 0.3202, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.5341865948130684, |
|
"grad_norm": 0.517285943031311, |
|
"learning_rate": 7.914549453158025e-06, |
|
"loss": 0.2497, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 0.5348602222970698, |
|
"grad_norm": 0.5324558615684509, |
|
"learning_rate": 7.896925905901223e-06, |
|
"loss": 0.2804, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.5355338497810711, |
|
"grad_norm": 0.5467241406440735, |
|
"learning_rate": 7.879300160549423e-06, |
|
"loss": 0.274, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.5362074772650725, |
|
"grad_norm": 0.5673408508300781, |
|
"learning_rate": 7.86167231471042e-06, |
|
"loss": 0.2681, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.5368811047490738, |
|
"grad_norm": 0.5435929298400879, |
|
"learning_rate": 7.844042466003643e-06, |
|
"loss": 0.2456, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 0.5375547322330751, |
|
"grad_norm": 0.5365129113197327, |
|
"learning_rate": 7.826410712059607e-06, |
|
"loss": 0.2433, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.5382283597170765, |
|
"grad_norm": 0.556115984916687, |
|
"learning_rate": 7.808777150519384e-06, |
|
"loss": 0.2723, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 0.5389019872010778, |
|
"grad_norm": 0.6075104475021362, |
|
"learning_rate": 7.791141879034055e-06, |
|
"loss": 0.3197, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5389019872010778, |
|
"eval_loss": 0.25853946805000305, |
|
"eval_runtime": 105.3349, |
|
"eval_samples_per_second": 47.468, |
|
"eval_steps_per_second": 2.971, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5395756146850792, |
|
"grad_norm": 0.5173077583312988, |
|
"learning_rate": 7.773504995264167e-06, |
|
"loss": 0.2458, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 0.5402492421690805, |
|
"grad_norm": 0.5317369699478149, |
|
"learning_rate": 7.755866596879203e-06, |
|
"loss": 0.2535, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.5409228696530819, |
|
"grad_norm": 0.5028438568115234, |
|
"learning_rate": 7.738226781557024e-06, |
|
"loss": 0.2558, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 0.5415964971370832, |
|
"grad_norm": 0.4917846918106079, |
|
"learning_rate": 7.720585646983346e-06, |
|
"loss": 0.2567, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.5422701246210846, |
|
"grad_norm": 0.5413616299629211, |
|
"learning_rate": 7.702943290851183e-06, |
|
"loss": 0.3068, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.5429437521050859, |
|
"grad_norm": 0.5557405352592468, |
|
"learning_rate": 7.685299810860319e-06, |
|
"loss": 0.2807, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.5436173795890872, |
|
"grad_norm": 0.5536317229270935, |
|
"learning_rate": 7.667655304716762e-06, |
|
"loss": 0.2535, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 0.5442910070730885, |
|
"grad_norm": 0.6285427808761597, |
|
"learning_rate": 7.650009870132202e-06, |
|
"loss": 0.2687, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.5449646345570899, |
|
"grad_norm": 0.5142940282821655, |
|
"learning_rate": 7.632363604823466e-06, |
|
"loss": 0.2328, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 0.5456382620410912, |
|
"grad_norm": 0.5419033765792847, |
|
"learning_rate": 7.614716606511986e-06, |
|
"loss": 0.2687, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.5463118895250926, |
|
"grad_norm": 0.5078312158584595, |
|
"learning_rate": 7.597068972923254e-06, |
|
"loss": 0.2429, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 0.5469855170090939, |
|
"grad_norm": 0.5140127539634705, |
|
"learning_rate": 7.579420801786278e-06, |
|
"loss": 0.2358, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.5476591444930953, |
|
"grad_norm": 0.5336434841156006, |
|
"learning_rate": 7.561772190833041e-06, |
|
"loss": 0.2561, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 0.5483327719770966, |
|
"grad_norm": 0.4892539978027344, |
|
"learning_rate": 7.544123237797967e-06, |
|
"loss": 0.2447, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.549006399461098, |
|
"grad_norm": 0.5128865838050842, |
|
"learning_rate": 7.526474040417368e-06, |
|
"loss": 0.2305, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.5496800269450993, |
|
"grad_norm": 0.5284186601638794, |
|
"learning_rate": 7.508824696428914e-06, |
|
"loss": 0.2665, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.5503536544291007, |
|
"grad_norm": 0.49982714653015137, |
|
"learning_rate": 7.491175303571087e-06, |
|
"loss": 0.2361, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 0.551027281913102, |
|
"grad_norm": 0.5274138450622559, |
|
"learning_rate": 7.473525959582631e-06, |
|
"loss": 0.2542, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.5517009093971034, |
|
"grad_norm": 0.5714825987815857, |
|
"learning_rate": 7.4558767622020345e-06, |
|
"loss": 0.287, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.5523745368811047, |
|
"grad_norm": 0.5137256979942322, |
|
"learning_rate": 7.438227809166959e-06, |
|
"loss": 0.2416, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.5530481643651061, |
|
"grad_norm": 0.5832123756408691, |
|
"learning_rate": 7.4205791982137215e-06, |
|
"loss": 0.2589, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 0.5537217918491074, |
|
"grad_norm": 0.6384348273277283, |
|
"learning_rate": 7.402931027076746e-06, |
|
"loss": 0.3011, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.5543954193331088, |
|
"grad_norm": 0.5485447645187378, |
|
"learning_rate": 7.385283393488017e-06, |
|
"loss": 0.2596, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 0.5550690468171101, |
|
"grad_norm": 0.5725424885749817, |
|
"learning_rate": 7.367636395176536e-06, |
|
"loss": 0.278, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.5557426743011115, |
|
"grad_norm": 0.49892446398735046, |
|
"learning_rate": 7.349990129867802e-06, |
|
"loss": 0.2308, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.5564163017851128, |
|
"grad_norm": 0.5304402709007263, |
|
"learning_rate": 7.332344695283239e-06, |
|
"loss": 0.2661, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.5570899292691142, |
|
"grad_norm": 0.5314590334892273, |
|
"learning_rate": 7.314700189139683e-06, |
|
"loss": 0.2545, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 0.5577635567531155, |
|
"grad_norm": 0.5156052112579346, |
|
"learning_rate": 7.297056709148819e-06, |
|
"loss": 0.2513, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.5584371842371169, |
|
"grad_norm": 0.5569677352905273, |
|
"learning_rate": 7.279414353016655e-06, |
|
"loss": 0.2701, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 0.5591108117211182, |
|
"grad_norm": 0.5068705081939697, |
|
"learning_rate": 7.261773218442978e-06, |
|
"loss": 0.2578, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.5597844392051196, |
|
"grad_norm": 0.5413905382156372, |
|
"learning_rate": 7.244133403120797e-06, |
|
"loss": 0.2657, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 0.5604580666891209, |
|
"grad_norm": 0.5509982109069824, |
|
"learning_rate": 7.226495004735833e-06, |
|
"loss": 0.2421, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.5611316941731223, |
|
"grad_norm": 0.5037456750869751, |
|
"learning_rate": 7.208858120965949e-06, |
|
"loss": 0.2366, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 0.5618053216571236, |
|
"grad_norm": 0.45753926038742065, |
|
"learning_rate": 7.191222849480618e-06, |
|
"loss": 0.2295, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.562478949141125, |
|
"grad_norm": 0.5005747079849243, |
|
"learning_rate": 7.1735892879403955e-06, |
|
"loss": 0.2431, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.5631525766251263, |
|
"grad_norm": 0.6139580607414246, |
|
"learning_rate": 7.155957533996361e-06, |
|
"loss": 0.2954, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.5638262041091276, |
|
"grad_norm": 0.4900098443031311, |
|
"learning_rate": 7.1383276852895805e-06, |
|
"loss": 0.2472, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 0.564499831593129, |
|
"grad_norm": 0.5588510632514954, |
|
"learning_rate": 7.120699839450578e-06, |
|
"loss": 0.2963, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.5651734590771303, |
|
"grad_norm": 0.45477819442749023, |
|
"learning_rate": 7.103074094098776e-06, |
|
"loss": 0.2459, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 0.5658470865611317, |
|
"grad_norm": 0.5369901061058044, |
|
"learning_rate": 7.085450546841977e-06, |
|
"loss": 0.2378, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.566520714045133, |
|
"grad_norm": 0.5580633878707886, |
|
"learning_rate": 7.0678292952757986e-06, |
|
"loss": 0.2466, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 0.5671943415291344, |
|
"grad_norm": 0.5392370223999023, |
|
"learning_rate": 7.050210436983152e-06, |
|
"loss": 0.2847, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.5678679690131357, |
|
"grad_norm": 0.5429926514625549, |
|
"learning_rate": 7.032594069533694e-06, |
|
"loss": 0.2589, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 0.5685415964971371, |
|
"grad_norm": 0.529365062713623, |
|
"learning_rate": 7.0149802904832865e-06, |
|
"loss": 0.2692, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.5692152239811384, |
|
"grad_norm": 0.5019341707229614, |
|
"learning_rate": 6.997369197373462e-06, |
|
"loss": 0.2501, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.5698888514651398, |
|
"grad_norm": 0.5088992714881897, |
|
"learning_rate": 6.979760887730873e-06, |
|
"loss": 0.2741, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.5705624789491411, |
|
"grad_norm": 0.5390922427177429, |
|
"learning_rate": 6.962155459066755e-06, |
|
"loss": 0.2653, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 0.5712361064331425, |
|
"grad_norm": 0.5300227403640747, |
|
"learning_rate": 6.9445530088764015e-06, |
|
"loss": 0.2356, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.5719097339171438, |
|
"grad_norm": 0.5471487641334534, |
|
"learning_rate": 6.926953634638598e-06, |
|
"loss": 0.2434, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 0.5725833614011452, |
|
"grad_norm": 0.49165770411491394, |
|
"learning_rate": 6.909357433815104e-06, |
|
"loss": 0.2539, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.5732569888851465, |
|
"grad_norm": 0.5154786705970764, |
|
"learning_rate": 6.891764503850109e-06, |
|
"loss": 0.2525, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 0.5739306163691479, |
|
"grad_norm": 0.5185630321502686, |
|
"learning_rate": 6.874174942169674e-06, |
|
"loss": 0.2709, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.5746042438531492, |
|
"grad_norm": 0.5015746355056763, |
|
"learning_rate": 6.856588846181228e-06, |
|
"loss": 0.2522, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 0.5752778713371506, |
|
"grad_norm": 0.5378702282905579, |
|
"learning_rate": 6.839006313272989e-06, |
|
"loss": 0.2634, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.5759514988211519, |
|
"grad_norm": 0.5816572308540344, |
|
"learning_rate": 6.82142744081345e-06, |
|
"loss": 0.3396, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.5766251263051533, |
|
"grad_norm": 0.5909308791160583, |
|
"learning_rate": 6.803852326150838e-06, |
|
"loss": 0.2834, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.5772987537891546, |
|
"grad_norm": 0.5006569623947144, |
|
"learning_rate": 6.786281066612564e-06, |
|
"loss": 0.212, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 0.577972381273156, |
|
"grad_norm": 0.5730767846107483, |
|
"learning_rate": 6.768713759504694e-06, |
|
"loss": 0.2998, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.5786460087571573, |
|
"grad_norm": 0.5159865617752075, |
|
"learning_rate": 6.751150502111406e-06, |
|
"loss": 0.2685, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 0.5793196362411587, |
|
"grad_norm": 0.5225328803062439, |
|
"learning_rate": 6.733591391694444e-06, |
|
"loss": 0.2404, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.57999326372516, |
|
"grad_norm": 0.540481686592102, |
|
"learning_rate": 6.7160365254926005e-06, |
|
"loss": 0.265, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 0.5806668912091614, |
|
"grad_norm": 0.5876161456108093, |
|
"learning_rate": 6.698486000721151e-06, |
|
"loss": 0.2758, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.5813405186931627, |
|
"grad_norm": 0.5269771218299866, |
|
"learning_rate": 6.680939914571336e-06, |
|
"loss": 0.2497, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 0.5820141461771641, |
|
"grad_norm": 0.5683711171150208, |
|
"learning_rate": 6.663398364209817e-06, |
|
"loss": 0.2895, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.5826877736611654, |
|
"grad_norm": 0.5690784454345703, |
|
"learning_rate": 6.645861446778131e-06, |
|
"loss": 0.2927, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.5833614011451668, |
|
"grad_norm": 0.4923837184906006, |
|
"learning_rate": 6.628329259392169e-06, |
|
"loss": 0.2294, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.5840350286291681, |
|
"grad_norm": 0.5871672630310059, |
|
"learning_rate": 6.610801899141618e-06, |
|
"loss": 0.2883, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 0.5847086561131695, |
|
"grad_norm": 0.5314139127731323, |
|
"learning_rate": 6.593279463089433e-06, |
|
"loss": 0.2698, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.5853822835971708, |
|
"grad_norm": 0.4713616967201233, |
|
"learning_rate": 6.575762048271311e-06, |
|
"loss": 0.2551, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 0.5860559110811722, |
|
"grad_norm": 0.5604876279830933, |
|
"learning_rate": 6.558249751695129e-06, |
|
"loss": 0.2507, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.5867295385651735, |
|
"grad_norm": 0.5332925319671631, |
|
"learning_rate": 6.54074267034043e-06, |
|
"loss": 0.2921, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 0.5874031660491748, |
|
"grad_norm": 0.5870206356048584, |
|
"learning_rate": 6.523240901157874e-06, |
|
"loss": 0.305, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.5880767935331761, |
|
"grad_norm": 0.5209013223648071, |
|
"learning_rate": 6.505744541068696e-06, |
|
"loss": 0.2504, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 0.5887504210171774, |
|
"grad_norm": 0.5347055196762085, |
|
"learning_rate": 6.488253686964189e-06, |
|
"loss": 0.26, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.5894240485011788, |
|
"grad_norm": 0.5568848252296448, |
|
"learning_rate": 6.470768435705146e-06, |
|
"loss": 0.2506, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.5900976759851801, |
|
"grad_norm": 0.4880235493183136, |
|
"learning_rate": 6.45328888412133e-06, |
|
"loss": 0.2549, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.5907713034691815, |
|
"grad_norm": 0.5328478217124939, |
|
"learning_rate": 6.435815129010952e-06, |
|
"loss": 0.2892, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 0.5914449309531828, |
|
"grad_norm": 0.5507891178131104, |
|
"learning_rate": 6.418347267140113e-06, |
|
"loss": 0.295, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.5921185584371842, |
|
"grad_norm": 0.5917878150939941, |
|
"learning_rate": 6.400885395242284e-06, |
|
"loss": 0.2775, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 0.5927921859211855, |
|
"grad_norm": 0.5396655201911926, |
|
"learning_rate": 6.383429610017763e-06, |
|
"loss": 0.2601, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.5934658134051869, |
|
"grad_norm": 0.5640776753425598, |
|
"learning_rate": 6.3659800081331375e-06, |
|
"loss": 0.2532, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 0.5941394408891882, |
|
"grad_norm": 0.5693733096122742, |
|
"learning_rate": 6.348536686220761e-06, |
|
"loss": 0.276, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.5948130683731896, |
|
"grad_norm": 0.49299901723861694, |
|
"learning_rate": 6.331099740878201e-06, |
|
"loss": 0.2197, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 0.5954866958571909, |
|
"grad_norm": 0.5112996697425842, |
|
"learning_rate": 6.3136692686677204e-06, |
|
"loss": 0.2685, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.5961603233411923, |
|
"grad_norm": 0.5770703554153442, |
|
"learning_rate": 6.2962453661157305e-06, |
|
"loss": 0.2439, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.5968339508251936, |
|
"grad_norm": 0.5604544878005981, |
|
"learning_rate": 6.2788281297122605e-06, |
|
"loss": 0.2603, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.597507578309195, |
|
"grad_norm": 0.5164006948471069, |
|
"learning_rate": 6.261417655910432e-06, |
|
"loss": 0.2419, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 0.5981812057931963, |
|
"grad_norm": 0.5085450410842896, |
|
"learning_rate": 6.244014041125906e-06, |
|
"loss": 0.2714, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.5988548332771977, |
|
"grad_norm": 0.5820232629776001, |
|
"learning_rate": 6.226617381736361e-06, |
|
"loss": 0.2909, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 0.599528460761199, |
|
"grad_norm": 0.5919815301895142, |
|
"learning_rate": 6.209227774080969e-06, |
|
"loss": 0.3283, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.6002020882452004, |
|
"grad_norm": 0.5612049102783203, |
|
"learning_rate": 6.191845314459836e-06, |
|
"loss": 0.2623, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 0.6008757157292017, |
|
"grad_norm": 0.5206785798072815, |
|
"learning_rate": 6.174470099133495e-06, |
|
"loss": 0.2391, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.6015493432132031, |
|
"grad_norm": 0.5109294652938843, |
|
"learning_rate": 6.157102224322357e-06, |
|
"loss": 0.2435, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 0.6022229706972044, |
|
"grad_norm": 0.5124114155769348, |
|
"learning_rate": 6.13974178620618e-06, |
|
"loss": 0.2508, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.6028965981812058, |
|
"grad_norm": 0.538691520690918, |
|
"learning_rate": 6.1223888809235475e-06, |
|
"loss": 0.2742, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.6035702256652071, |
|
"grad_norm": 0.4782629609107971, |
|
"learning_rate": 6.105043604571319e-06, |
|
"loss": 0.215, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.6042438531492085, |
|
"grad_norm": 0.48708873987197876, |
|
"learning_rate": 6.087706053204106e-06, |
|
"loss": 0.2685, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 0.6049174806332098, |
|
"grad_norm": 0.5199108719825745, |
|
"learning_rate": 6.070376322833751e-06, |
|
"loss": 0.2522, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.6055911081172112, |
|
"grad_norm": 0.5264055728912354, |
|
"learning_rate": 6.053054509428774e-06, |
|
"loss": 0.2702, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 0.6062647356012125, |
|
"grad_norm": 0.5014949440956116, |
|
"learning_rate": 6.035740708913861e-06, |
|
"loss": 0.2592, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6062647356012125, |
|
"eval_loss": 0.255189448595047, |
|
"eval_runtime": 106.7863, |
|
"eval_samples_per_second": 46.823, |
|
"eval_steps_per_second": 2.931, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6069383630852139, |
|
"grad_norm": 0.5549845695495605, |
|
"learning_rate": 6.01843501716932e-06, |
|
"loss": 0.2676, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 0.6076119905692152, |
|
"grad_norm": 0.5285577178001404, |
|
"learning_rate": 6.001137530030551e-06, |
|
"loss": 0.287, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 0.6082856180532166, |
|
"grad_norm": 0.5555633306503296, |
|
"learning_rate": 5.983848343287529e-06, |
|
"loss": 0.27, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 0.6089592455372179, |
|
"grad_norm": 0.4878551661968231, |
|
"learning_rate": 5.966567552684248e-06, |
|
"loss": 0.2132, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.6096328730212193, |
|
"grad_norm": 0.5712552070617676, |
|
"learning_rate": 5.949295253918223e-06, |
|
"loss": 0.264, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.6103065005052206, |
|
"grad_norm": 0.5029177665710449, |
|
"learning_rate": 5.932031542639929e-06, |
|
"loss": 0.2327, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 0.610980127989222, |
|
"grad_norm": 0.5280793309211731, |
|
"learning_rate": 5.914776514452292e-06, |
|
"loss": 0.2666, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 0.6116537554732233, |
|
"grad_norm": 0.5493948459625244, |
|
"learning_rate": 5.897530264910151e-06, |
|
"loss": 0.2747, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 0.6123273829572247, |
|
"grad_norm": 0.5202659964561462, |
|
"learning_rate": 5.880292889519733e-06, |
|
"loss": 0.2648, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 0.613001010441226, |
|
"grad_norm": 0.5150463581085205, |
|
"learning_rate": 5.863064483738114e-06, |
|
"loss": 0.2465, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.6136746379252274, |
|
"grad_norm": 0.5893501043319702, |
|
"learning_rate": 5.845845142972711e-06, |
|
"loss": 0.258, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 0.6143482654092287, |
|
"grad_norm": 0.5318591594696045, |
|
"learning_rate": 5.828634962580728e-06, |
|
"loss": 0.2566, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.6150218928932301, |
|
"grad_norm": 0.5880672335624695, |
|
"learning_rate": 5.811434037868652e-06, |
|
"loss": 0.2776, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 0.6156955203772314, |
|
"grad_norm": 0.536673903465271, |
|
"learning_rate": 5.794242464091703e-06, |
|
"loss": 0.2655, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 0.6163691478612328, |
|
"grad_norm": 0.53472501039505, |
|
"learning_rate": 5.777060336453324e-06, |
|
"loss": 0.2465, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.6170427753452341, |
|
"grad_norm": 0.5193040370941162, |
|
"learning_rate": 5.75988775010465e-06, |
|
"loss": 0.2597, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 0.6177164028292355, |
|
"grad_norm": 0.5033950209617615, |
|
"learning_rate": 5.742724800143967e-06, |
|
"loss": 0.2564, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 0.6183900303132368, |
|
"grad_norm": 0.479815274477005, |
|
"learning_rate": 5.725571581616212e-06, |
|
"loss": 0.2359, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 0.6190636577972382, |
|
"grad_norm": 0.6129284501075745, |
|
"learning_rate": 5.708428189512418e-06, |
|
"loss": 0.2789, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 0.6197372852812395, |
|
"grad_norm": 0.5521009564399719, |
|
"learning_rate": 5.691294718769205e-06, |
|
"loss": 0.2605, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.6204109127652409, |
|
"grad_norm": 0.5177654027938843, |
|
"learning_rate": 5.674171264268255e-06, |
|
"loss": 0.2519, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 0.6210845402492422, |
|
"grad_norm": 0.5175455808639526, |
|
"learning_rate": 5.657057920835781e-06, |
|
"loss": 0.2247, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 0.6217581677332435, |
|
"grad_norm": 0.5852333903312683, |
|
"learning_rate": 5.639954783241994e-06, |
|
"loss": 0.2767, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 0.6224317952172449, |
|
"grad_norm": 0.508068323135376, |
|
"learning_rate": 5.622861946200602e-06, |
|
"loss": 0.2584, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.6231054227012462, |
|
"grad_norm": 0.5253134965896606, |
|
"learning_rate": 5.605779504368256e-06, |
|
"loss": 0.2479, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.6237790501852476, |
|
"grad_norm": 0.5303956866264343, |
|
"learning_rate": 5.588707552344052e-06, |
|
"loss": 0.2445, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 0.624452677669249, |
|
"grad_norm": 0.5583487749099731, |
|
"learning_rate": 5.571646184668989e-06, |
|
"loss": 0.2703, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 0.6251263051532503, |
|
"grad_norm": 0.48656296730041504, |
|
"learning_rate": 5.5545954958254535e-06, |
|
"loss": 0.22, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.6257999326372516, |
|
"grad_norm": 0.5838629603385925, |
|
"learning_rate": 5.537555580236696e-06, |
|
"loss": 0.2995, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 0.626473560121253, |
|
"grad_norm": 0.4895997643470764, |
|
"learning_rate": 5.520526532266303e-06, |
|
"loss": 0.2508, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.6271471876052543, |
|
"grad_norm": 0.5736584663391113, |
|
"learning_rate": 5.503508446217687e-06, |
|
"loss": 0.2738, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 0.6278208150892557, |
|
"grad_norm": 0.507853627204895, |
|
"learning_rate": 5.486501416333547e-06, |
|
"loss": 0.2342, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 0.628494442573257, |
|
"grad_norm": 0.5749799013137817, |
|
"learning_rate": 5.469505536795354e-06, |
|
"loss": 0.2505, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 0.6291680700572584, |
|
"grad_norm": 0.5327485203742981, |
|
"learning_rate": 5.452520901722843e-06, |
|
"loss": 0.2444, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 0.6298416975412597, |
|
"grad_norm": 0.5296816229820251, |
|
"learning_rate": 5.435547605173464e-06, |
|
"loss": 0.2369, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.630515325025261, |
|
"grad_norm": 0.568265974521637, |
|
"learning_rate": 5.4185857411418856e-06, |
|
"loss": 0.2668, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.6311889525092623, |
|
"grad_norm": 0.571258544921875, |
|
"learning_rate": 5.401635403559467e-06, |
|
"loss": 0.2651, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 0.6318625799932637, |
|
"grad_norm": 0.5336675047874451, |
|
"learning_rate": 5.384696686293728e-06, |
|
"loss": 0.2571, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 0.632536207477265, |
|
"grad_norm": 0.5422372221946716, |
|
"learning_rate": 5.367769683147849e-06, |
|
"loss": 0.2474, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 0.6332098349612664, |
|
"grad_norm": 0.5360538363456726, |
|
"learning_rate": 5.350854487860127e-06, |
|
"loss": 0.2612, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.6338834624452677, |
|
"grad_norm": 0.5526731014251709, |
|
"learning_rate": 5.333951194103476e-06, |
|
"loss": 0.291, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 0.6345570899292691, |
|
"grad_norm": 0.501751720905304, |
|
"learning_rate": 5.317059895484905e-06, |
|
"loss": 0.2305, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 0.6352307174132704, |
|
"grad_norm": 0.5227620005607605, |
|
"learning_rate": 5.300180685544992e-06, |
|
"loss": 0.2425, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 0.6359043448972718, |
|
"grad_norm": 0.4993986189365387, |
|
"learning_rate": 5.28331365775737e-06, |
|
"loss": 0.2426, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 0.6365779723812731, |
|
"grad_norm": 0.5128391981124878, |
|
"learning_rate": 5.266458905528214e-06, |
|
"loss": 0.2635, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.6372515998652745, |
|
"grad_norm": 0.5762251615524292, |
|
"learning_rate": 5.2496165221957105e-06, |
|
"loss": 0.2652, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 0.6379252273492758, |
|
"grad_norm": 0.48678985238075256, |
|
"learning_rate": 5.232786601029562e-06, |
|
"loss": 0.2518, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 0.6385988548332772, |
|
"grad_norm": 0.5538300275802612, |
|
"learning_rate": 5.215969235230447e-06, |
|
"loss": 0.2489, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 0.6392724823172785, |
|
"grad_norm": 0.5497295260429382, |
|
"learning_rate": 5.199164517929521e-06, |
|
"loss": 0.2454, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 0.6399461098012799, |
|
"grad_norm": 0.49725213646888733, |
|
"learning_rate": 5.182372542187895e-06, |
|
"loss": 0.2555, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.6406197372852812, |
|
"grad_norm": 0.533641517162323, |
|
"learning_rate": 5.165593400996114e-06, |
|
"loss": 0.2927, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 0.6412933647692826, |
|
"grad_norm": 0.5489848256111145, |
|
"learning_rate": 5.148827187273657e-06, |
|
"loss": 0.2801, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.6419669922532839, |
|
"grad_norm": 0.5017451643943787, |
|
"learning_rate": 5.132073993868406e-06, |
|
"loss": 0.264, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 0.6426406197372853, |
|
"grad_norm": 0.5372846722602844, |
|
"learning_rate": 5.115333913556137e-06, |
|
"loss": 0.2721, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.6433142472212866, |
|
"grad_norm": 0.5356566309928894, |
|
"learning_rate": 5.098607039040019e-06, |
|
"loss": 0.2608, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.643987874705288, |
|
"grad_norm": 0.5957320928573608, |
|
"learning_rate": 5.081893462950079e-06, |
|
"loss": 0.2601, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 0.6446615021892893, |
|
"grad_norm": 0.5288376212120056, |
|
"learning_rate": 5.0651932778427074e-06, |
|
"loss": 0.2587, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 0.6453351296732907, |
|
"grad_norm": 0.5290555953979492, |
|
"learning_rate": 5.048506576200137e-06, |
|
"loss": 0.2756, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 0.646008757157292, |
|
"grad_norm": 0.5248770117759705, |
|
"learning_rate": 5.031833450429925e-06, |
|
"loss": 0.2451, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 0.6466823846412934, |
|
"grad_norm": 0.5826844573020935, |
|
"learning_rate": 5.0151739928644585e-06, |
|
"loss": 0.2619, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.6473560121252947, |
|
"grad_norm": 0.5782036185264587, |
|
"learning_rate": 4.998528295760426e-06, |
|
"loss": 0.2751, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 0.648029639609296, |
|
"grad_norm": 0.5718022584915161, |
|
"learning_rate": 4.981896451298311e-06, |
|
"loss": 0.2754, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 0.6487032670932974, |
|
"grad_norm": 0.5494672060012817, |
|
"learning_rate": 4.965278551581896e-06, |
|
"loss": 0.2612, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 0.6493768945772987, |
|
"grad_norm": 0.5269261002540588, |
|
"learning_rate": 4.948674688637724e-06, |
|
"loss": 0.2498, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 0.6500505220613001, |
|
"grad_norm": 0.5737338662147522, |
|
"learning_rate": 4.932084954414619e-06, |
|
"loss": 0.2512, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.6507241495453014, |
|
"grad_norm": 0.6112325191497803, |
|
"learning_rate": 4.915509440783158e-06, |
|
"loss": 0.2436, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.6513977770293028, |
|
"grad_norm": 0.5490378737449646, |
|
"learning_rate": 4.898948239535162e-06, |
|
"loss": 0.2666, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 0.6520714045133041, |
|
"grad_norm": 0.49087807536125183, |
|
"learning_rate": 4.882401442383205e-06, |
|
"loss": 0.2307, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 0.6527450319973055, |
|
"grad_norm": 0.5699329972267151, |
|
"learning_rate": 4.865869140960081e-06, |
|
"loss": 0.2788, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 0.6534186594813068, |
|
"grad_norm": 0.5976101756095886, |
|
"learning_rate": 4.8493514268183154e-06, |
|
"loss": 0.295, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.6540922869653082, |
|
"grad_norm": 0.5223735570907593, |
|
"learning_rate": 4.8328483914296545e-06, |
|
"loss": 0.2524, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 0.6547659144493095, |
|
"grad_norm": 0.521709680557251, |
|
"learning_rate": 4.816360126184552e-06, |
|
"loss": 0.256, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 0.6554395419333109, |
|
"grad_norm": 0.6724926829338074, |
|
"learning_rate": 4.799886722391676e-06, |
|
"loss": 0.3489, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 0.6561131694173122, |
|
"grad_norm": 0.5592423677444458, |
|
"learning_rate": 4.783428271277383e-06, |
|
"loss": 0.2486, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 0.6567867969013136, |
|
"grad_norm": 0.5178484320640564, |
|
"learning_rate": 4.766984863985229e-06, |
|
"loss": 0.231, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.6574604243853149, |
|
"grad_norm": 0.5621674060821533, |
|
"learning_rate": 4.750556591575467e-06, |
|
"loss": 0.286, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 0.6581340518693163, |
|
"grad_norm": 0.5048483610153198, |
|
"learning_rate": 4.734143545024527e-06, |
|
"loss": 0.2308, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 0.6588076793533176, |
|
"grad_norm": 0.5381827354431152, |
|
"learning_rate": 4.7177458152245286e-06, |
|
"loss": 0.262, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 0.659481306837319, |
|
"grad_norm": 0.6153239607810974, |
|
"learning_rate": 4.701363492982763e-06, |
|
"loss": 0.2889, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 0.6601549343213203, |
|
"grad_norm": 0.5119926333427429, |
|
"learning_rate": 4.684996669021202e-06, |
|
"loss": 0.2313, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.6608285618053217, |
|
"grad_norm": 0.5575754046440125, |
|
"learning_rate": 4.668645433975994e-06, |
|
"loss": 0.2926, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 0.661502189289323, |
|
"grad_norm": 0.5210109949111938, |
|
"learning_rate": 4.652309878396955e-06, |
|
"loss": 0.2567, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 0.6621758167733244, |
|
"grad_norm": 0.5215359330177307, |
|
"learning_rate": 4.635990092747066e-06, |
|
"loss": 0.2542, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 0.6628494442573257, |
|
"grad_norm": 0.5880634188652039, |
|
"learning_rate": 4.619686167401991e-06, |
|
"loss": 0.3099, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 0.6635230717413271, |
|
"grad_norm": 0.6169697046279907, |
|
"learning_rate": 4.603398192649549e-06, |
|
"loss": 0.3095, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.6641966992253284, |
|
"grad_norm": 0.5105169415473938, |
|
"learning_rate": 4.5871262586892365e-06, |
|
"loss": 0.2439, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 0.6648703267093298, |
|
"grad_norm": 0.5112780928611755, |
|
"learning_rate": 4.5708704556317195e-06, |
|
"loss": 0.2843, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 0.6655439541933311, |
|
"grad_norm": 0.5523808002471924, |
|
"learning_rate": 4.554630873498325e-06, |
|
"loss": 0.2779, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 0.6662175816773325, |
|
"grad_norm": 0.49872297048568726, |
|
"learning_rate": 4.538407602220566e-06, |
|
"loss": 0.2385, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 0.6668912091613338, |
|
"grad_norm": 0.4888902008533478, |
|
"learning_rate": 4.522200731639616e-06, |
|
"loss": 0.2541, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.6675648366453352, |
|
"grad_norm": 0.5053279995918274, |
|
"learning_rate": 4.506010351505834e-06, |
|
"loss": 0.2465, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 0.6682384641293365, |
|
"grad_norm": 0.5656309723854065, |
|
"learning_rate": 4.489836551478254e-06, |
|
"loss": 0.2878, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 0.6689120916133379, |
|
"grad_norm": 0.5291764736175537, |
|
"learning_rate": 4.473679421124092e-06, |
|
"loss": 0.2803, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 0.6695857190973392, |
|
"grad_norm": 0.5425894260406494, |
|
"learning_rate": 4.457539049918253e-06, |
|
"loss": 0.2758, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 0.6702593465813406, |
|
"grad_norm": 0.5237170457839966, |
|
"learning_rate": 4.441415527242835e-06, |
|
"loss": 0.2615, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.6709329740653419, |
|
"grad_norm": 0.48956528306007385, |
|
"learning_rate": 4.425308942386624e-06, |
|
"loss": 0.2502, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 0.6716066015493433, |
|
"grad_norm": 0.5325567722320557, |
|
"learning_rate": 4.409219384544621e-06, |
|
"loss": 0.2663, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 0.6722802290333446, |
|
"grad_norm": 0.5196683406829834, |
|
"learning_rate": 4.3931469428175195e-06, |
|
"loss": 0.2785, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 0.672953856517346, |
|
"grad_norm": 0.5358217358589172, |
|
"learning_rate": 4.377091706211243e-06, |
|
"loss": 0.2701, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 0.6736274840013473, |
|
"grad_norm": 0.5513088703155518, |
|
"learning_rate": 4.3610537636364256e-06, |
|
"loss": 0.2583, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6736274840013473, |
|
"eval_loss": 0.2523915767669678, |
|
"eval_runtime": 104.4369, |
|
"eval_samples_per_second": 47.876, |
|
"eval_steps_per_second": 2.997, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6743011114853485, |
|
"grad_norm": 0.599454402923584, |
|
"learning_rate": 4.345033203907931e-06, |
|
"loss": 0.3127, |
|
"step": 1001 |
|
}, |
|
{ |
|
"epoch": 0.6749747389693499, |
|
"grad_norm": 0.5195282697677612, |
|
"learning_rate": 4.329030115744368e-06, |
|
"loss": 0.2336, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 0.6756483664533512, |
|
"grad_norm": 0.5394783616065979, |
|
"learning_rate": 4.313044587767581e-06, |
|
"loss": 0.2266, |
|
"step": 1003 |
|
}, |
|
{ |
|
"epoch": 0.6763219939373526, |
|
"grad_norm": 0.502860963344574, |
|
"learning_rate": 4.297076708502179e-06, |
|
"loss": 0.2226, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 0.6769956214213539, |
|
"grad_norm": 0.5646010637283325, |
|
"learning_rate": 4.281126566375035e-06, |
|
"loss": 0.2612, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.6776692489053553, |
|
"grad_norm": 0.5330033898353577, |
|
"learning_rate": 4.265194249714788e-06, |
|
"loss": 0.27, |
|
"step": 1006 |
|
}, |
|
{ |
|
"epoch": 0.6783428763893566, |
|
"grad_norm": 0.5152742266654968, |
|
"learning_rate": 4.249279846751376e-06, |
|
"loss": 0.2522, |
|
"step": 1007 |
|
}, |
|
{ |
|
"epoch": 0.679016503873358, |
|
"grad_norm": 0.5699672698974609, |
|
"learning_rate": 4.233383445615524e-06, |
"loss": 0.3023,
"step": 1008
},
{
"epoch": 0.6796901313573593,
"grad_norm": 0.5329395532608032,
"learning_rate": 4.21750513433827e-06,
"loss": 0.2413,
"step": 1009
},
{
"epoch": 0.6803637588413607,
"grad_norm": 0.4887201488018036,
"learning_rate": 4.201645000850481e-06,
"loss": 0.24,
"step": 1010
},
{
"epoch": 0.681037386325362,
"grad_norm": 0.49501362442970276,
"learning_rate": 4.1858031329823445e-06,
"loss": 0.2288,
"step": 1011
},
{
"epoch": 0.6817110138093634,
"grad_norm": 0.48089247941970825,
"learning_rate": 4.169979618462912e-06,
"loss": 0.2311,
"step": 1012
},
{
"epoch": 0.6823846412933647,
"grad_norm": 0.5128735899925232,
"learning_rate": 4.154174544919591e-06,
"loss": 0.2342,
"step": 1013
},
{
"epoch": 0.6830582687773661,
"grad_norm": 0.5249293446540833,
"learning_rate": 4.13838799987766e-06,
"loss": 0.2799,
"step": 1014
},
{
"epoch": 0.6837318962613674,
"grad_norm": 0.5358514785766602,
"learning_rate": 4.122620070759805e-06,
"loss": 0.2569,
"step": 1015
},
{
"epoch": 0.6844055237453688,
"grad_norm": 0.4961945712566376,
"learning_rate": 4.106870844885606e-06,
"loss": 0.2856,
"step": 1016
},
{
"epoch": 0.6850791512293701,
"grad_norm": 0.5068737268447876,
"learning_rate": 4.091140409471082e-06,
"loss": 0.247,
"step": 1017
},
{
"epoch": 0.6857527787133715,
"grad_norm": 0.5845658779144287,
"learning_rate": 4.0754288516281805e-06,
"loss": 0.3199,
"step": 1018
},
{
"epoch": 0.6864264061973728,
"grad_norm": 0.5782644152641296,
"learning_rate": 4.05973625836432e-06,
"loss": 0.277,
"step": 1019
},
{
"epoch": 0.6871000336813742,
"grad_norm": 0.4946306347846985,
"learning_rate": 4.044062716581894e-06,
"loss": 0.2596,
"step": 1020
},
{
"epoch": 0.6877736611653755,
"grad_norm": 0.4905906915664673,
"learning_rate": 4.02840831307779e-06,
"loss": 0.243,
"step": 1021
},
{
"epoch": 0.6884472886493769,
"grad_norm": 0.5413460731506348,
"learning_rate": 4.012773134542911e-06,
"loss": 0.2787,
"step": 1022
},
{
"epoch": 0.6891209161333782,
"grad_norm": 0.5733334422111511,
"learning_rate": 3.997157267561701e-06,
"loss": 0.2473,
"step": 1023
},
{
"epoch": 0.6897945436173796,
"grad_norm": 0.5300191044807434,
"learning_rate": 3.981560798611655e-06,
"loss": 0.2451,
"step": 1024
},
{
"epoch": 0.6904681711013809,
"grad_norm": 0.5103181600570679,
"learning_rate": 3.965983814062852e-06,
"loss": 0.2519,
"step": 1025
},
{
"epoch": 0.6911417985853823,
"grad_norm": 0.6034629344940186,
"learning_rate": 3.950426400177465e-06,
"loss": 0.2702,
"step": 1026
},
{
"epoch": 0.6918154260693836,
"grad_norm": 0.5429885387420654,
"learning_rate": 3.934888643109288e-06,
"loss": 0.2549,
"step": 1027
},
{
"epoch": 0.692489053553385,
"grad_norm": 0.5191195011138916,
"learning_rate": 3.919370628903266e-06,
"loss": 0.263,
"step": 1028
},
{
"epoch": 0.6931626810373863,
"grad_norm": 0.5467904210090637,
"learning_rate": 3.903872443495005e-06,
"loss": 0.2502,
"step": 1029
},
{
"epoch": 0.6938363085213877,
"grad_norm": 0.5506168007850647,
"learning_rate": 3.888394172710305e-06,
"loss": 0.2731,
"step": 1030
},
{
"epoch": 0.694509936005389,
"grad_norm": 0.5254278182983398,
"learning_rate": 3.872935902264689e-06,
"loss": 0.2547,
"step": 1031
},
{
"epoch": 0.6951835634893904,
"grad_norm": 0.5569198131561279,
"learning_rate": 3.857497717762911e-06,
"loss": 0.2644,
"step": 1032
},
{
"epoch": 0.6958571909733917,
"grad_norm": 0.5069310069084167,
"learning_rate": 3.8420797046985024e-06,
"loss": 0.2643,
"step": 1033
},
{
"epoch": 0.6965308184573931,
"grad_norm": 0.539691686630249,
"learning_rate": 3.826681948453288e-06,
"loss": 0.259,
"step": 1034
},
{
"epoch": 0.6972044459413944,
"grad_norm": 0.49122515320777893,
"learning_rate": 3.8113045342969083e-06,
"loss": 0.2326,
"step": 1035
},
{
"epoch": 0.6978780734253958,
"grad_norm": 0.5575307011604309,
"learning_rate": 3.7959475473863624e-06,
"loss": 0.262,
"step": 1036
},
{
"epoch": 0.6985517009093971,
"grad_norm": 0.48936426639556885,
"learning_rate": 3.7806110727655185e-06,
"loss": 0.2561,
"step": 1037
},
{
"epoch": 0.6992253283933985,
"grad_norm": 0.5563534498214722,
"learning_rate": 3.76529519536466e-06,
"loss": 0.2706,
"step": 1038
},
{
"epoch": 0.6998989558773998,
"grad_norm": 0.567457377910614,
"learning_rate": 3.750000000000002e-06,
"loss": 0.2453,
"step": 1039
},
{
"epoch": 0.7005725833614012,
"grad_norm": 0.5170741677284241,
"learning_rate": 3.7347255713732236e-06,
"loss": 0.2199,
"step": 1040
},
{
"epoch": 0.7012462108454025,
"grad_norm": 0.5691453218460083,
"learning_rate": 3.7194719940710135e-06,
"loss": 0.2831,
"step": 1041
},
{
"epoch": 0.7019198383294039,
"grad_norm": 0.5637477040290833,
"learning_rate": 3.7042393525645793e-06,
"loss": 0.2747,
"step": 1042
},
{
"epoch": 0.7025934658134052,
"grad_norm": 0.5031242966651917,
"learning_rate": 3.689027731209191e-06,
"loss": 0.2321,
"step": 1043
},
{
"epoch": 0.7032670932974066,
"grad_norm": 0.6389548182487488,
"learning_rate": 3.6738372142437223e-06,
"loss": 0.2598,
"step": 1044
},
{
"epoch": 0.7039407207814079,
"grad_norm": 0.5508800148963928,
"learning_rate": 3.6586678857901624e-06,
"loss": 0.2607,
"step": 1045
},
{
"epoch": 0.7046143482654093,
"grad_norm": 0.5346333980560303,
"learning_rate": 3.6435198298531762e-06,
"loss": 0.2484,
"step": 1046
},
{
"epoch": 0.7052879757494106,
"grad_norm": 0.5504537224769592,
"learning_rate": 3.6283931303196123e-06,
"loss": 0.2751,
"step": 1047
},
{
"epoch": 0.705961603233412,
"grad_norm": 0.5008198618888855,
"learning_rate": 3.6132878709580612e-06,
"loss": 0.235,
"step": 1048
},
{
"epoch": 0.7066352307174133,
"grad_norm": 0.5508736371994019,
"learning_rate": 3.5982041354183843e-06,
"loss": 0.2627,
"step": 1049
},
{
"epoch": 0.7073088582014146,
"grad_norm": 0.5439531207084656,
"learning_rate": 3.583142007231235e-06,
"loss": 0.2524,
"step": 1050
},
{
"epoch": 0.707982485685416,
"grad_norm": 0.5159290432929993,
"learning_rate": 3.5681015698076254e-06,
"loss": 0.2323,
"step": 1051
},
{
"epoch": 0.7086561131694173,
"grad_norm": 0.5393319725990295,
"learning_rate": 3.5530829064384378e-06,
"loss": 0.2732,
"step": 1052
},
{
"epoch": 0.7093297406534187,
"grad_norm": 0.5629071593284607,
"learning_rate": 3.5380861002939764e-06,
"loss": 0.2651,
"step": 1053
},
{
"epoch": 0.71000336813742,
"grad_norm": 0.4863939881324768,
"learning_rate": 3.523111234423509e-06,
"loss": 0.2181,
"step": 1054
},
{
"epoch": 0.7106769956214214,
"grad_norm": 0.4968824088573456,
"learning_rate": 3.508158391754798e-06,
"loss": 0.22,
"step": 1055
},
{
"epoch": 0.7113506231054227,
"grad_norm": 0.49191296100616455,
"learning_rate": 3.493227655093645e-06,
"loss": 0.246,
"step": 1056
},
{
"epoch": 0.7120242505894241,
"grad_norm": 0.526577353477478,
"learning_rate": 3.4783191071234387e-06,
"loss": 0.2494,
"step": 1057
},
{
"epoch": 0.7126978780734254,
"grad_norm": 0.5475011467933655,
"learning_rate": 3.463432830404685e-06,
"loss": 0.2609,
"step": 1058
},
{
"epoch": 0.7133715055574268,
"grad_norm": 0.5295203328132629,
"learning_rate": 3.448568907374563e-06,
"loss": 0.2494,
"step": 1059
},
{
"epoch": 0.7140451330414281,
"grad_norm": 0.5042027831077576,
"learning_rate": 3.4337274203464523e-06,
"loss": 0.2266,
"step": 1060
},
{
"epoch": 0.7147187605254295,
"grad_norm": 0.5058079957962036,
"learning_rate": 3.4189084515094974e-06,
"loss": 0.2344,
"step": 1061
},
{
"epoch": 0.7153923880094308,
"grad_norm": 0.5442999601364136,
"learning_rate": 3.40411208292813e-06,
"loss": 0.2545,
"step": 1062
},
{
"epoch": 0.7160660154934322,
"grad_norm": 0.578435480594635,
"learning_rate": 3.3893383965416355e-06,
"loss": 0.2534,
"step": 1063
},
{
"epoch": 0.7167396429774335,
"grad_norm": 0.5491860508918762,
"learning_rate": 3.37458747416369e-06,
"loss": 0.3073,
"step": 1064
},
{
"epoch": 0.7174132704614348,
"grad_norm": 0.49808141589164734,
"learning_rate": 3.3598593974818997e-06,
"loss": 0.2254,
"step": 1065
},
{
"epoch": 0.7180868979454361,
"grad_norm": 0.5253027081489563,
"learning_rate": 3.345154248057359e-06,
"loss": 0.2227,
"step": 1066
},
{
"epoch": 0.7187605254294375,
"grad_norm": 0.5097954273223877,
"learning_rate": 3.3304721073242004e-06,
"loss": 0.2159,
"step": 1067
},
{
"epoch": 0.7194341529134388,
"grad_norm": 0.5558974146842957,
"learning_rate": 3.3158130565891347e-06,
"loss": 0.2458,
"step": 1068
},
{
"epoch": 0.7201077803974402,
"grad_norm": 0.529330849647522,
"learning_rate": 3.3011771770310014e-06,
"loss": 0.2666,
"step": 1069
},
{
"epoch": 0.7207814078814415,
"grad_norm": 0.5007720589637756,
"learning_rate": 3.286564549700333e-06,
"loss": 0.2415,
"step": 1070
},
{
"epoch": 0.7214550353654429,
"grad_norm": 0.6243747472763062,
"learning_rate": 3.271975255518884e-06,
"loss": 0.291,
"step": 1071
},
{
"epoch": 0.7221286628494442,
"grad_norm": 0.5337501168251038,
"learning_rate": 3.2574093752792068e-06,
"loss": 0.2675,
"step": 1072
},
{
"epoch": 0.7228022903334456,
"grad_norm": 0.6054463982582092,
"learning_rate": 3.2428669896441833e-06,
"loss": 0.3009,
"step": 1073
},
{
"epoch": 0.7234759178174469,
"grad_norm": 0.5312137007713318,
"learning_rate": 3.228348179146586e-06,
"loss": 0.2513,
"step": 1074
},
{
"epoch": 0.7241495453014483,
"grad_norm": 0.510999858379364,
"learning_rate": 3.2138530241886403e-06,
"loss": 0.2454,
"step": 1075
},
{
"epoch": 0.7248231727854496,
"grad_norm": 0.5202507972717285,
"learning_rate": 3.199381605041571e-06,
"loss": 0.2348,
"step": 1076
},
{
"epoch": 0.725496800269451,
"grad_norm": 0.5304343700408936,
"learning_rate": 3.18493400184515e-06,
"loss": 0.2677,
"step": 1077
},
{
"epoch": 0.7261704277534523,
"grad_norm": 0.5709294676780701,
"learning_rate": 3.1705102946072746e-06,
"loss": 0.2855,
"step": 1078
},
{
"epoch": 0.7268440552374537,
"grad_norm": 0.5579668879508972,
"learning_rate": 3.156110563203498e-06,
"loss": 0.2858,
"step": 1079
},
{
"epoch": 0.727517682721455,
"grad_norm": 0.6169936060905457,
"learning_rate": 3.141734887376612e-06,
"loss": 0.2939,
"step": 1080
},
{
"epoch": 0.7281913102054564,
"grad_norm": 0.556327223777771,
"learning_rate": 3.127383346736184e-06,
"loss": 0.2797,
"step": 1081
},
{
"epoch": 0.7288649376894577,
"grad_norm": 0.4888077974319458,
"learning_rate": 3.1130560207581275e-06,
"loss": 0.2147,
"step": 1082
},
{
"epoch": 0.729538565173459,
"grad_norm": 0.568587601184845,
"learning_rate": 3.098752988784268e-06,
"loss": 0.2786,
"step": 1083
},
{
"epoch": 0.7302121926574604,
"grad_norm": 0.5443982481956482,
"learning_rate": 3.084474330021882e-06,
"loss": 0.2445,
"step": 1084
},
{
"epoch": 0.7308858201414618,
"grad_norm": 0.4714532494544983,
"learning_rate": 3.070220123543288e-06,
"loss": 0.2044,
"step": 1085
},
{
"epoch": 0.7315594476254631,
"grad_norm": 0.5746622085571289,
"learning_rate": 3.0559904482853808e-06,
"loss": 0.2627,
"step": 1086
},
{
"epoch": 0.7322330751094644,
"grad_norm": 0.5493502616882324,
"learning_rate": 3.041785383049206e-06,
"loss": 0.2564,
"step": 1087
},
{
"epoch": 0.7329067025934658,
"grad_norm": 0.5477399826049805,
"learning_rate": 3.027605006499536e-06,
"loss": 0.252,
"step": 1088
},
{
"epoch": 0.7335803300774671,
"grad_norm": 0.507681667804718,
"learning_rate": 3.013449397164407e-06,
"loss": 0.246,
"step": 1089
},
{
"epoch": 0.7342539575614685,
"grad_norm": 0.5245915651321411,
"learning_rate": 2.99931863343471e-06,
"loss": 0.2374,
"step": 1090
},
{
"epoch": 0.7349275850454698,
"grad_norm": 0.526141345500946,
"learning_rate": 2.985212793563745e-06,
"loss": 0.2358,
"step": 1091
},
{
"epoch": 0.7356012125294712,
"grad_norm": 0.5303114652633667,
"learning_rate": 2.971131955666782e-06,
"loss": 0.232,
"step": 1092
},
{
"epoch": 0.7362748400134725,
"grad_norm": 0.5322446227073669,
"learning_rate": 2.957076197720644e-06,
"loss": 0.2536,
"step": 1093
},
{
"epoch": 0.7369484674974739,
"grad_norm": 0.6054010987281799,
"learning_rate": 2.9430455975632593e-06,
"loss": 0.2825,
"step": 1094
},
{
"epoch": 0.7376220949814752,
"grad_norm": 0.4822597801685333,
"learning_rate": 2.9290402328932374e-06,
"loss": 0.2158,
"step": 1095
},
{
"epoch": 0.7382957224654766,
"grad_norm": 0.5092408061027527,
"learning_rate": 2.9150601812694477e-06,
"loss": 0.2434,
"step": 1096
},
{
"epoch": 0.7389693499494779,
"grad_norm": 0.48755231499671936,
"learning_rate": 2.901105520110569e-06,
"loss": 0.2489,
"step": 1097
},
{
"epoch": 0.7396429774334793,
"grad_norm": 0.5457514524459839,
"learning_rate": 2.887176326694684e-06,
"loss": 0.269,
"step": 1098
},
{
"epoch": 0.7403166049174806,
"grad_norm": 0.5498961210250854,
"learning_rate": 2.8732726781588325e-06,
"loss": 0.2446,
"step": 1099
},
{
"epoch": 0.740990232401482,
"grad_norm": 0.5210698246955872,
"learning_rate": 2.859394651498592e-06,
"loss": 0.2447,
"step": 1100
},
{
"epoch": 0.740990232401482,
"eval_loss": 0.25004303455352783,
"eval_runtime": 104.4563,
"eval_samples_per_second": 47.867,
"eval_steps_per_second": 2.996,
"step": 1100
},
{
"epoch": 0.7416638598854833,
"grad_norm": 0.517212986946106,
"learning_rate": 2.8455423235676586e-06,
"loss": 0.252,
"step": 1101
},
{
"epoch": 0.7423374873694847,
"grad_norm": 0.5591882467269897,
"learning_rate": 2.8317157710774066e-06,
"loss": 0.2567,
"step": 1102
},
{
"epoch": 0.743011114853486,
"grad_norm": 0.5390484929084778,
"learning_rate": 2.8179150705964713e-06,
"loss": 0.2752,
"step": 1103
},
{
"epoch": 0.7436847423374874,
"grad_norm": 0.5284495949745178,
"learning_rate": 2.8041402985503294e-06,
"loss": 0.2248,
"step": 1104
},
{
"epoch": 0.7443583698214887,
"grad_norm": 0.5000547170639038,
"learning_rate": 2.7903915312208696e-06,
"loss": 0.2352,
"step": 1105
},
{
"epoch": 0.7450319973054901,
"grad_norm": 0.5302792191505432,
"learning_rate": 2.7766688447459735e-06,
"loss": 0.2328,
"step": 1106
},
{
"epoch": 0.7457056247894914,
"grad_norm": 0.5096173286437988,
"learning_rate": 2.762972315119088e-06,
"loss": 0.2408,
"step": 1107
},
{
"epoch": 0.7463792522734928,
"grad_norm": 0.5064148902893066,
"learning_rate": 2.7493020181888058e-06,
"loss": 0.2385,
"step": 1108
},
{
"epoch": 0.7470528797574941,
"grad_norm": 0.508243203163147,
"learning_rate": 2.735658029658461e-06,
"loss": 0.2482,
"step": 1109
},
{
"epoch": 0.7477265072414955,
"grad_norm": 0.5063915848731995,
"learning_rate": 2.7220404250856833e-06,
"loss": 0.2661,
"step": 1110
},
{
"epoch": 0.7484001347254968,
"grad_norm": 0.5426428318023682,
"learning_rate": 2.7084492798820035e-06,
"loss": 0.2527,
"step": 1111
},
{
"epoch": 0.7490737622094982,
"grad_norm": 0.5647068023681641,
"learning_rate": 2.6948846693124188e-06,
"loss": 0.2906,
"step": 1112
},
{
"epoch": 0.7497473896934995,
"grad_norm": 0.5713849663734436,
"learning_rate": 2.681346668494985e-06,
"loss": 0.258,
"step": 1113
},
{
"epoch": 0.7504210171775009,
"grad_norm": 0.5479983687400818,
"learning_rate": 2.6678353524004027e-06,
"loss": 0.2393,
"step": 1114
},
{
"epoch": 0.7510946446615022,
"grad_norm": 0.5582695007324219,
"learning_rate": 2.654350795851593e-06,
"loss": 0.2351,
"step": 1115
},
{
"epoch": 0.7517682721455036,
"grad_norm": 0.5141502618789673,
"learning_rate": 2.640893073523286e-06,
"loss": 0.2587,
"step": 1116
},
{
"epoch": 0.7524418996295049,
"grad_norm": 0.5738133788108826,
"learning_rate": 2.6274622599416197e-06,
"loss": 0.2719,
"step": 1117
},
{
"epoch": 0.7531155271135063,
"grad_norm": 0.5252017974853516,
"learning_rate": 2.614058429483703e-06,
"loss": 0.2979,
"step": 1118
},
{
"epoch": 0.7537891545975076,
"grad_norm": 0.5406326055526733,
"learning_rate": 2.600681656377229e-06,
"loss": 0.2803,
"step": 1119
},
{
"epoch": 0.754462782081509,
"grad_norm": 0.5155901312828064,
"learning_rate": 2.587332014700051e-06,
"loss": 0.2645,
"step": 1120
},
{
"epoch": 0.7551364095655103,
"grad_norm": 0.49665388464927673,
"learning_rate": 2.5740095783797656e-06,
"loss": 0.2482,
"step": 1121
},
{
"epoch": 0.7558100370495117,
"grad_norm": 0.585488498210907,
"learning_rate": 2.560714421193323e-06,
"loss": 0.3037,
"step": 1122
},
{
"epoch": 0.756483664533513,
"grad_norm": 0.5360546708106995,
"learning_rate": 2.547446616766597e-06,
"loss": 0.2697,
"step": 1123
},
{
"epoch": 0.7571572920175144,
"grad_norm": 0.5727128982543945,
"learning_rate": 2.534206238573997e-06,
"loss": 0.2627,
"step": 1124
},
{
"epoch": 0.7578309195015157,
"grad_norm": 0.5131103992462158,
"learning_rate": 2.5209933599380443e-06,
"loss": 0.2576,
"step": 1125
},
{
"epoch": 0.7585045469855171,
"grad_norm": 0.5608295798301697,
"learning_rate": 2.507808054028972e-06,
"loss": 0.2851,
"step": 1126
},
{
"epoch": 0.7591781744695184,
"grad_norm": 0.47002115845680237,
"learning_rate": 2.4946503938643306e-06,
"loss": 0.2293,
"step": 1127
},
{
"epoch": 0.7598518019535198,
"grad_norm": 0.5585823655128479,
"learning_rate": 2.4815204523085656e-06,
"loss": 0.2893,
"step": 1128
},
{
"epoch": 0.7605254294375211,
"grad_norm": 0.5366945266723633,
"learning_rate": 2.4684183020726213e-06,
"loss": 0.2358,
"step": 1129
},
{
"epoch": 0.7611990569215223,
"grad_norm": 0.5874181985855103,
"learning_rate": 2.4553440157135496e-06,
"loss": 0.2795,
"step": 1130
},
{
"epoch": 0.7618726844055237,
"grad_norm": 0.4842762351036072,
"learning_rate": 2.442297665634085e-06,
"loss": 0.2238,
"step": 1131
},
{
"epoch": 0.762546311889525,
"grad_norm": 0.5179473161697388,
"learning_rate": 2.4292793240822682e-06,
"loss": 0.236,
"step": 1132
},
{
"epoch": 0.7632199393735264,
"grad_norm": 0.5912408232688904,
"learning_rate": 2.4162890631510233e-06,
"loss": 0.2599,
"step": 1133
},
{
"epoch": 0.7638935668575277,
"grad_norm": 0.541069746017456,
"learning_rate": 2.4033269547777788e-06,
"loss": 0.2805,
"step": 1134
},
{
"epoch": 0.7645671943415291,
"grad_norm": 0.5376386046409607,
"learning_rate": 2.3903930707440584e-06,
"loss": 0.2604,
"step": 1135
},
{
"epoch": 0.7652408218255304,
"grad_norm": 0.5389772057533264,
"learning_rate": 2.3774874826750796e-06,
"loss": 0.2417,
"step": 1136
},
{
"epoch": 0.7659144493095318,
"grad_norm": 0.49994543194770813,
"learning_rate": 2.364610262039369e-06,
"loss": 0.237,
"step": 1137
},
{
"epoch": 0.7665880767935331,
"grad_norm": 0.5099679827690125,
"learning_rate": 2.351761480148358e-06,
"loss": 0.2376,
"step": 1138
},
{
"epoch": 0.7672617042775345,
"grad_norm": 0.5313609838485718,
"learning_rate": 2.3389412081559842e-06,
"loss": 0.2559,
"step": 1139
},
{
"epoch": 0.7679353317615358,
"grad_norm": 0.5471706390380859,
"learning_rate": 2.326149517058314e-06,
"loss": 0.2667,
"step": 1140
},
{
"epoch": 0.7686089592455372,
"grad_norm": 0.4816801846027374,
"learning_rate": 2.313386477693131e-06,
"loss": 0.2245,
"step": 1141
},
{
"epoch": 0.7692825867295385,
"grad_norm": 0.5714917182922363,
"learning_rate": 2.3006521607395516e-06,
"loss": 0.3004,
"step": 1142
},
{
"epoch": 0.7699562142135399,
"grad_norm": 0.5020681619644165,
"learning_rate": 2.2879466367176393e-06,
"loss": 0.2477,
"step": 1143
},
{
"epoch": 0.7706298416975412,
"grad_norm": 0.5577126145362854,
"learning_rate": 2.275269975987998e-06,
"loss": 0.2691,
"step": 1144
},
{
"epoch": 0.7713034691815426,
"grad_norm": 0.481965035200119,
"learning_rate": 2.262622248751405e-06,
"loss": 0.2481,
"step": 1145
},
{
"epoch": 0.7719770966655439,
"grad_norm": 0.5549229979515076,
"learning_rate": 2.250003525048398e-06,
"loss": 0.2568,
"step": 1146
},
{
"epoch": 0.7726507241495453,
"grad_norm": 0.48651793599128723,
"learning_rate": 2.2374138747589086e-06,
"loss": 0.2255,
"step": 1147
},
{
"epoch": 0.7733243516335466,
"grad_norm": 0.5344985723495483,
"learning_rate": 2.224853367601858e-06,
"loss": 0.2485,
"step": 1148
},
{
"epoch": 0.773997979117548,
"grad_norm": 0.5281147360801697,
"learning_rate": 2.212322073134783e-06,
"loss": 0.2634,
"step": 1149
},
{
"epoch": 0.7746716066015493,
"grad_norm": 0.5449070930480957,
"learning_rate": 2.199820060753449e-06,
"loss": 0.2726,
"step": 1150
},
{
"epoch": 0.7753452340855507,
"grad_norm": 0.5562757253646851,
"learning_rate": 2.187347399691457e-06,
"loss": 0.2837,
"step": 1151
},
{
"epoch": 0.776018861569552,
"grad_norm": 0.5350236892700195,
"learning_rate": 2.1749041590198664e-06,
"loss": 0.2456,
"step": 1152
},
{
"epoch": 0.7766924890535534,
"grad_norm": 0.5433318614959717,
"learning_rate": 2.1624904076468215e-06,
"loss": 0.2465,
"step": 1153
},
{
"epoch": 0.7773661165375547,
"grad_norm": 0.5183177590370178,
"learning_rate": 2.1501062143171506e-06,
"loss": 0.245,
"step": 1154
},
{
"epoch": 0.7780397440215561,
"grad_norm": 0.5763646960258484,
"learning_rate": 2.137751647611997e-06,
"loss": 0.2403,
"step": 1155
},
{
"epoch": 0.7787133715055574,
"grad_norm": 0.47587332129478455,
"learning_rate": 2.125426775948446e-06,
"loss": 0.2331,
"step": 1156
},
{
"epoch": 0.7793869989895588,
"grad_norm": 0.4680344760417938,
"learning_rate": 2.113131667579127e-06,
"loss": 0.2246,
"step": 1157
},
{
"epoch": 0.7800606264735601,
"grad_norm": 0.5854060649871826,
"learning_rate": 2.1008663905918553e-06,
"loss": 0.3072,
"step": 1158
},
{
"epoch": 0.7807342539575615,
"grad_norm": 0.48982059955596924,
"learning_rate": 2.088631012909242e-06,
"loss": 0.2257,
"step": 1159
},
{
"epoch": 0.7814078814415628,
"grad_norm": 0.522206723690033,
"learning_rate": 2.0764256022883174e-06,
"loss": 0.2607,
"step": 1160
},
{
"epoch": 0.7820815089255642,
"grad_norm": 0.524056077003479,
"learning_rate": 2.0642502263201687e-06,
"loss": 0.2478,
"step": 1161
},
{
"epoch": 0.7827551364095655,
"grad_norm": 0.49944427609443665,
"learning_rate": 2.052104952429555e-06,
"loss": 0.2371,
"step": 1162
},
{
"epoch": 0.7834287638935669,
"grad_norm": 0.5648651719093323,
"learning_rate": 2.0399898478745307e-06,
"loss": 0.2341,
"step": 1163
},
{
"epoch": 0.7841023913775682,
"grad_norm": 0.5346094965934753,
"learning_rate": 2.027904979746088e-06,
"loss": 0.2694,
"step": 1164
},
{
"epoch": 0.7847760188615696,
"grad_norm": 0.5461552739143372,
"learning_rate": 2.0158504149677643e-06,
"loss": 0.2178,
"step": 1165
},
{
"epoch": 0.7854496463455709,
"grad_norm": 0.5188043713569641,
"learning_rate": 2.003826220295295e-06,
"loss": 0.2449,
"step": 1166
},
{
"epoch": 0.7861232738295723,
"grad_norm": 0.5148183703422546,
"learning_rate": 1.9918324623162253e-06,
"loss": 0.2381,
"step": 1167
},
{
"epoch": 0.7867969013135736,
"grad_norm": 0.550039529800415,
"learning_rate": 1.979869207449545e-06,
"loss": 0.2633,
"step": 1168
},
{
"epoch": 0.787470528797575,
"grad_norm": 0.5604125261306763,
"learning_rate": 1.9679365219453337e-06,
"loss": 0.2605,
"step": 1169
},
{
"epoch": 0.7881441562815763,
"grad_norm": 0.5368652939796448,
"learning_rate": 1.9560344718843746e-06,
"loss": 0.2725,
"step": 1170
},
{
"epoch": 0.7888177837655777,
"grad_norm": 0.5414903163909912,
"learning_rate": 1.9441631231778063e-06,
"loss": 0.2505,
"step": 1171
},
{
"epoch": 0.789491411249579,
"grad_norm": 0.4991462528705597,
"learning_rate": 1.932322541566743e-06,
"loss": 0.2455,
"step": 1172
},
{
"epoch": 0.7901650387335803,
"grad_norm": 0.4789801836013794,
"learning_rate": 1.920512792621917e-06,
"loss": 0.2252,
"step": 1173
},
{
"epoch": 0.7908386662175817,
"grad_norm": 0.572302520275116,
"learning_rate": 1.908733941743322e-06,
"loss": 0.2803,
"step": 1174
},
{
"epoch": 0.791512293701583,
"grad_norm": 0.5717800259590149,
"learning_rate": 1.8969860541598358e-06,
"loss": 0.2782,
"step": 1175
},
{
"epoch": 0.7921859211855844,
"grad_norm": 0.5871665477752686,
"learning_rate": 1.885269194928876e-06,
"loss": 0.2884,
"step": 1176
},
{
"epoch": 0.7928595486695857,
"grad_norm": 0.5179949402809143,
"learning_rate": 1.8735834289360281e-06,
"loss": 0.2484,
"step": 1177
},
{
"epoch": 0.7935331761535871,
"grad_norm": 0.5230392813682556,
"learning_rate": 1.8619288208946858e-06,
"loss": 0.244,
"step": 1178
},
{
"epoch": 0.7942068036375884,
"grad_norm": 0.4939616024494171,
"learning_rate": 1.850305435345704e-06,
"loss": 0.2202,
"step": 1179
},
{
"epoch": 0.7948804311215898,
"grad_norm": 0.5704658031463623,
"learning_rate": 1.8387133366570284e-06,
"loss": 0.2999,
"step": 1180
},
{
"epoch": 0.7955540586055911,
"grad_norm": 0.5412735342979431,
"learning_rate": 1.8271525890233412e-06,
"loss": 0.254,
"step": 1181
},
{
"epoch": 0.7962276860895925,
"grad_norm": 0.5522796511650085,
"learning_rate": 1.8156232564657204e-06,
"loss": 0.256,
"step": 1182
},
{
"epoch": 0.7969013135735938,
"grad_norm": 0.5172164440155029,
"learning_rate": 1.8041254028312604e-06,
"loss": 0.2408,
"step": 1183
},
{
"epoch": 0.7975749410575952,
"grad_norm": 0.5235007405281067,
"learning_rate": 1.792659091792742e-06,
"loss": 0.2455,
"step": 1184
},
{
"epoch": 0.7982485685415965,
"grad_norm": 0.5019668340682983,
"learning_rate": 1.781224386848265e-06,
"loss": 0.2212,
"step": 1185
},
{
"epoch": 0.7989221960255979,
"grad_norm": 0.5576637387275696,
"learning_rate": 1.7698213513208983e-06,
"loss": 0.2655,
"step": 1186
},
{
"epoch": 0.7995958235095992,
"grad_norm": 0.5962572693824768,
"learning_rate": 1.758450048358339e-06,
"loss": 0.2673,
"step": 1187
},
{
"epoch": 0.8002694509936006,
"grad_norm": 0.5175941586494446,
"learning_rate": 1.7471105409325507e-06,
"loss": 0.2609,
"step": 1188
},
{
"epoch": 0.8009430784776019,
"grad_norm": 0.5470423698425293,
"learning_rate": 1.7358028918394187e-06,
"loss": 0.2781,
"step": 1189
},
{
"epoch": 0.8016167059616033,
"grad_norm": 0.5484721660614014,
"learning_rate": 1.7245271636984072e-06,
"loss": 0.2503,
"step": 1190
},
{
"epoch": 0.8022903334456046,
"grad_norm": 0.5539147257804871,
"learning_rate": 1.7132834189522075e-06,
"loss": 0.2697,
"step": 1191
},
{
"epoch": 0.802963960929606,
"grad_norm": 0.5356603860855103,
"learning_rate": 1.7020717198663948e-06,
"loss": 0.2343,
"step": 1192
},
{
"epoch": 0.8036375884136073,
"grad_norm": 0.5115563273429871,
"learning_rate": 1.690892128529078e-06,
"loss": 0.2507,
"step": 1193
},
{
"epoch": 0.8043112158976087,
"grad_norm": 0.5782252550125122,
"learning_rate": 1.6797447068505604e-06,
"loss": 0.2993,
"step": 1194
},
{
"epoch": 0.8049848433816099,
"grad_norm": 0.4831259548664093,
"learning_rate": 1.6686295165630005e-06,
"loss": 0.2095,
"step": 1195
},
{
"epoch": 0.8056584708656113,
"grad_norm": 0.5409023761749268,
"learning_rate": 1.6575466192200609e-06,
"loss": 0.2591,
"step": 1196
},
{
"epoch": 0.8063320983496126,
"grad_norm": 0.510886013507843,
"learning_rate": 1.6464960761965773e-06,
"loss": 0.2221,
"step": 1197
},
{
"epoch": 0.807005725833614,
"grad_norm": 0.5136117935180664,
"learning_rate": 1.635477948688209e-06,
"loss": 0.2306,
"step": 1198
},
{
"epoch": 0.8076793533176153,
"grad_norm": 0.5284943580627441,
"learning_rate": 1.624492297711106e-06,
"loss": 0.2497,
"step": 1199
},
{
"epoch": 0.8083529808016167,
"grad_norm": 0.547935426235199,
"learning_rate": 1.6135391841015749e-06,
"loss": 0.237,
"step": 1200
},
{
"epoch": 0.8083529808016167,
"eval_loss": 0.24800407886505127,
"eval_runtime": 104.1907,
"eval_samples_per_second": 47.989,
"eval_steps_per_second": 3.004,
"step": 1200
},
{
"epoch": 0.809026608285618,
"grad_norm": 0.5304118990898132,
"learning_rate": 1.6026186685157299e-06,
"loss": 0.2637,
"step": 1201
},
{
"epoch": 0.8097002357696194,
"grad_norm": 0.5001785755157471,
"learning_rate": 1.591730811429165e-06,
"loss": 0.2512,
"step": 1202
},
{
"epoch": 0.8103738632536207,
"grad_norm": 0.5340592265129089,
"learning_rate": 1.5808756731366246e-06,
"loss": 0.2356,
"step": 1203
},
{
"epoch": 0.8110474907376221,
"grad_norm": 0.5753365755081177,
"learning_rate": 1.5700533137516538e-06,
"loss": 0.2411,
"step": 1204
},
{
"epoch": 0.8117211182216234,
"grad_norm": 0.5412161946296692,
"learning_rate": 1.559263793206282e-06,
"loss": 0.259,
"step": 1205
},
{
"epoch": 0.8123947457056248,
"grad_norm": 0.5372704267501831,
"learning_rate": 1.5485071712506836e-06,
"loss": 0.2583,
"step": 1206
},
{
"epoch": 0.8130683731896261,
"grad_norm": 0.5177714228630066,
"learning_rate": 1.5377835074528396e-06,
"loss": 0.2566,
"step": 1207
},
{
"epoch": 0.8137420006736275,
"grad_norm": 0.5761350989341736,
"learning_rate": 1.5270928611982252e-06,
"loss": 0.2748,
"step": 1208
},
{
"epoch": 0.8144156281576288,
"grad_norm": 0.4516087770462036,
"learning_rate": 1.5164352916894639e-06,
"loss": 0.2042,
"step": 1209
},
{
"epoch": 0.8150892556416302,
"grad_norm": 0.5373425483703613,
"learning_rate": 1.5058108579460117e-06,
"loss": 0.2473,
"step": 1210
},
{
"epoch": 0.8157628831256315,
"grad_norm": 0.5737300515174866,
"learning_rate": 1.4952196188038232e-06,
"loss": 0.2378,
"step": 1211
},
{
"epoch": 0.8164365106096328,
"grad_norm": 0.5578494071960449,
"learning_rate": 1.4846616329150252e-06,
"loss": 0.2455,
"step": 1212
},
{
"epoch": 0.8171101380936342,
"grad_norm": 0.5392588376998901,
"learning_rate": 1.4741369587476023e-06,
"loss": 0.2587,
"step": 1213
},
{
"epoch": 0.8177837655776355,
"grad_norm": 0.5129286050796509,
"learning_rate": 1.4636456545850584e-06,
"loss": 0.2269,
"step": 1214
},
{
"epoch": 0.8184573930616369,
"grad_norm": 0.5656298398971558,
"learning_rate": 1.4531877785261032e-06,
"loss": 0.25,
"step": 1215
},
{
"epoch": 0.8191310205456382,
"grad_norm": 0.5242322087287903,
"learning_rate": 1.4427633884843321e-06,
"loss": 0.2213,
"step": 1216
},
{
"epoch": 0.8198046480296396,
"grad_norm": 0.5498111248016357,
"learning_rate": 1.432372542187895e-06,
"loss": 0.2842,
"step": 1217
},
{
"epoch": 0.8204782755136409,
"grad_norm": 0.5031629204750061,
"learning_rate": 1.42201529717919e-06,
"loss": 0.2404,
"step": 1218
},
{
"epoch": 0.8211519029976423,
"grad_norm": 0.5014514327049255,
"learning_rate": 1.4116917108145318e-06,
"loss": 0.2447,
"step": 1219
},
{
"epoch": 0.8218255304816436,
"grad_norm": 0.6042701005935669,
"learning_rate": 1.4014018402638454e-06,
"loss": 0.2935,
"step": 1220
},
{
"epoch": 0.822499157965645,
"grad_norm": 0.4977283179759979,
"learning_rate": 1.3911457425103444e-06,
"loss": 0.2463,
"step": 1221
},
{
"epoch": 0.8231727854496463,
"grad_norm": 0.5383139252662659,
"learning_rate": 1.3809234743502109e-06,
"loss": 0.2432,
"step": 1222
},
{
"epoch": 0.8238464129336477,
"grad_norm": 0.4999409019947052,
"learning_rate": 1.3707350923922915e-06,
"loss": 0.2427,
"step": 1223
},
{
"epoch": 0.824520040417649,
"grad_norm": 0.5174874067306519,
"learning_rate": 1.3605806530577725e-06,
"loss": 0.2475,
"step": 1224
},
{
"epoch": 0.8251936679016504,
"grad_norm": 0.545793890953064,
"learning_rate": 1.3504602125798742e-06,
"loss": 0.26,
"step": 1225
},
{
"epoch": 0.8258672953856517,
"grad_norm": 0.5483867526054382,
"learning_rate": 1.340373827003543e-06,
"loss": 0.2454,
"step": 1226
},
{
"epoch": 0.8265409228696531,
"grad_norm": 0.5197309851646423,
"learning_rate": 1.3303215521851303e-06,
"loss": 0.2109,
"step": 1227
},
{
"epoch": 0.8272145503536544,
"grad_norm": 0.5258607864379883,
"learning_rate": 1.3203034437920889e-06,
"loss": 0.2473,
"step": 1228
},
{
"epoch": 0.8278881778376558,
"grad_norm": 0.5303134322166443,
"learning_rate": 1.3103195573026708e-06,
"loss": 0.2348,
"step": 1229
},
{
"epoch": 0.8285618053216571,
"grad_norm": 0.6092815399169922,
"learning_rate": 1.3003699480056073e-06,
"loss": 0.3257,
"step": 1230
},
{
"epoch": 0.8292354328056585,
"grad_norm": 0.5898075103759766,
"learning_rate": 1.2904546709998153e-06,
"loss": 0.2941,
"step": 1231
},
{
"epoch": 0.8299090602896598,
"grad_norm": 0.519887387752533,
"learning_rate": 1.2805737811940814e-06,
"loss": 0.2452,
"step": 1232
},
{
"epoch": 0.8305826877736612,
"grad_norm": 0.5605584979057312,
"learning_rate": 1.2707273333067675e-06,
"loss": 0.254,
"step": 1233
},
{
"epoch": 0.8312563152576625,
"grad_norm": 0.5284910202026367,
"learning_rate": 1.2609153818654983e-06,
"loss": 0.2709,
"step": 1234
},
{
"epoch": 0.8319299427416639,
"grad_norm": 0.5385324358940125,
"learning_rate": 1.2511379812068683e-06,
"loss": 0.2483,
"step": 1235
},
{
"epoch": 0.8326035702256652,
"grad_norm": 0.5575158596038818,
"learning_rate": 1.2413951854761364e-06,
"loss": 0.2434,
"step": 1236
},
{
"epoch": 0.8332771977096666,
"grad_norm": 0.4902471601963043,
"learning_rate": 1.231687048626925e-06,
"loss": 0.2183,
"step": 1237
},
{
"epoch": 0.8339508251936679,
"grad_norm": 0.4895704984664917,
"learning_rate": 1.22201362442092e-06,
"loss": 0.2555,
"step": 1238
},
{
"epoch": 0.8346244526776693,
"grad_norm": 0.5114362835884094,
"learning_rate": 1.2123749664275823e-06,
"loss": 0.2474,
"step": 1239
},
{
"epoch": 0.8352980801616706,
"grad_norm": 0.5052047371864319,
"learning_rate": 1.2027711280238396e-06,
"loss": 0.2158,
"step": 1240
},
{
"epoch": 0.835971707645672,
"grad_norm": 0.549039900302887,
"learning_rate": 1.1932021623937954e-06,
"loss": 0.2728,
"step": 1241
},
{
"epoch": 0.8366453351296733,
"grad_norm": 0.5076087117195129,
"learning_rate": 1.1836681225284401e-06,
"loss": 0.248,
"step": 1242
},
{
"epoch": 0.8373189626136747,
"grad_norm": 0.5593048930168152,
"learning_rate": 1.1741690612253455e-06,
"loss": 0.2778,
"step": 1243
},
{
"epoch": 0.837992590097676,
"grad_norm": 0.557877242565155,
"learning_rate": 1.1647050310883855e-06,
"loss": 0.2744,
"step": 1244
},
{
"epoch": 0.8386662175816774,
"grad_norm": 0.571789562702179,
"learning_rate": 1.155276084527435e-06,
"loss": 0.2623,
"step": 1245
},
{
"epoch": 0.8393398450656787,
"grad_norm": 0.532217800617218,
"learning_rate": 1.1458822737580804e-06,
"loss": 0.2604,
"step": 1246
},
{
"epoch": 0.8400134725496801,
"grad_norm": 0.488652765750885,
"learning_rate": 1.1365236508013396e-06,
"loss": 0.2302,
"step": 1247
},
{
"epoch": 0.8406871000336814,
"grad_norm": 0.48154351115226746,
"learning_rate": 1.1272002674833668e-06,
"loss": 0.2292,
"step": 1248
},
{
"epoch": 0.8413607275176828,
"grad_norm": 0.5416498184204102,
"learning_rate": 1.1179121754351587e-06,
"loss": 0.2675,
"step": 1249
},
{
"epoch": 0.8420343550016841,
"grad_norm": 0.5223404169082642,
"learning_rate": 1.1086594260922873e-06,
"loss": 0.2495,
"step": 1250
},
{
"epoch": 0.8427079824856855,
"grad_norm": 0.4795687198638916,
"learning_rate": 1.0994420706945922e-06,
"loss": 0.2405,
"step": 1251
},
{
"epoch": 0.8433816099696868,
"grad_norm": 0.528590202331543,
"learning_rate": 1.0902601602859192e-06,
"loss": 0.234,
"step": 1252
},
{
"epoch": 0.8440552374536882,
"grad_norm": 0.5488812327384949,
"learning_rate": 1.0811137457138195e-06,
"loss": 0.2309,
"step": 1253
},
{
"epoch": 0.8447288649376895,
"grad_norm": 0.48816734552383423,
"learning_rate": 1.0720028776292775e-06,
"loss": 0.2252,
"step": 1254
},
{
"epoch": 0.8454024924216909,
"grad_norm": 0.4672640860080719,
"learning_rate": 1.0629276064864315e-06,
"loss": 0.2241,
"step": 1255
},
{
"epoch": 0.8460761199056922,
"grad_norm": 0.6129331588745117,
"learning_rate": 1.053887982542286e-06,
"loss": 0.2933,
"step": 1256
},
{
"epoch": 0.8467497473896936,
"grad_norm": 0.4707486033439636,
"learning_rate": 1.0448840558564437e-06,
"loss": 0.2263,
"step": 1257
},
{
"epoch": 0.8474233748736949,
"grad_norm": 0.5238653421401978,
"learning_rate": 1.0359158762908206e-06,
"loss": 0.2251,
"step": 1258
},
{
"epoch": 0.8480970023576961,
"grad_norm": 0.5185546278953552,
"learning_rate": 1.0269834935093692e-06,
"loss": 0.2423,
"step": 1259
},
{
"epoch": 0.8487706298416975,
"grad_norm": 0.551475465297699,
"learning_rate": 1.0180869569778146e-06,
"loss": 0.252,
"step": 1260
},
{
"epoch": 0.8494442573256988,
"grad_norm": 0.5508469343185425,
"learning_rate": 1.0092263159633643e-06,
"loss": 0.2689,
"step": 1261
},
{
"epoch": 0.8501178848097002,
"grad_norm": 0.5417614579200745,
"learning_rate": 1.000401619534449e-06,
"loss": 0.2693,
"step": 1262
},
{
"epoch": 0.8507915122937015,
"grad_norm": 0.5418084859848022,
"learning_rate": 9.91612916560445e-07,
"loss": 0.2355,
"step": 1263
},
{
"epoch": 0.8514651397777029,
"grad_norm": 0.49578985571861267,
"learning_rate": 9.828602557114017e-07,
"loss": 0.2373,
"step": 1264
},
{
"epoch": 0.8521387672617042,
"grad_norm": 0.5037760734558105,
"learning_rate": 9.741436854577778e-07,
"loss": 0.2109,
"step": 1265
},
{
"epoch": 0.8528123947457056,
"grad_norm": 0.5789549946784973,
"learning_rate": 9.654632540701663e-07,
"loss": 0.2314,
"step": 1266
},
{
"epoch": 0.8534860222297069,
"grad_norm": 0.5235623717308044,
"learning_rate": 9.568190096190321e-07,
"loss": 0.2648,
"step": 1267
},
{
"epoch": 0.8541596497137083,
"grad_norm": 0.5276992917060852,
"learning_rate": 9.482109999744456e-07,
"loss": 0.2422,
"step": 1268
},
{
"epoch": 0.8548332771977096,
"grad_norm": 0.5321447253227234,
"learning_rate": 9.396392728058129e-07,
"loss": 0.2257,
"step": 1269
},
{
"epoch": 0.855506904681711,
"grad_norm": 0.5687943696975708,
"learning_rate": 9.311038755816187e-07,
"loss": 0.2534,
"step": 1270
},
{
"epoch": 0.8561805321657123,
"grad_norm": 0.5804548263549805,
"learning_rate": 9.226048555691583e-07,
"loss": 0.2888,
"step": 1271
},
{
"epoch": 0.8568541596497137,
"grad_norm": 0.603203535079956,
"learning_rate": 9.141422598342745e-07,
"loss": 0.2695,
"step": 1272
},
{
"epoch": 0.857527787133715,
"grad_norm": 0.49607640504837036,
"learning_rate": 9.057161352411055e-07,
"loss": 0.2329,
"step": 1273
},
{
"epoch": 0.8582014146177164,
"grad_norm": 0.5365235209465027,
"learning_rate": 8.973265284518168e-07,
"loss": 0.2576,
"step": 1274
},
{
"epoch": 0.8588750421017177,
"grad_norm": 0.4797859191894531,
"learning_rate": 8.889734859263429e-07,
"loss": 0.2337,
"step": 1275
},
{
"epoch": 0.8595486695857191,
"grad_norm": 0.5661630034446716,
"learning_rate": 8.806570539221378e-07,
"loss": 0.2612,
"step": 1276
},
{
"epoch": 0.8602222970697204,
"grad_norm": 0.5369821190834045,
"learning_rate": 8.723772784939132e-07,
"loss": 0.2509,
"step": 1277
},
{
"epoch": 0.8608959245537218,
"grad_norm": 0.6320977807044983,
"learning_rate": 8.641342054933799e-07,
"loss": 0.2837,
"step": 1278
},
{
"epoch": 0.8615695520377231,
"grad_norm": 0.605116069316864,
"learning_rate": 8.559278805690027e-07,
"loss": 0.3332,
"step": 1279
},
{
"epoch": 0.8622431795217245,
"grad_norm": 0.5639594793319702,
"learning_rate": 8.477583491657404e-07,
"loss": 0.267,
"step": 1280
},
{
"epoch": 0.8629168070057258,
"grad_norm": 0.5377295613288879,
"learning_rate": 8.396256565247987e-07,
"loss": 0.2624,
"step": 1281
},
{
"epoch": 0.8635904344897272,
"grad_norm": 0.5511677265167236,
"learning_rate": 8.315298476833749e-07,
"loss": 0.2253,
"step": 1282
},
{
"epoch": 0.8642640619737285,
"grad_norm": 0.5488938093185425,
"learning_rate": 8.234709674744156e-07,
"loss": 0.2745,
"step": 1283
},
{
"epoch": 0.8649376894577299,
"grad_norm": 0.538172721862793,
"learning_rate": 8.154490605263592e-07,
"loss": 0.2664,
"step": 1284
},
{
"epoch": 0.8656113169417312,
"grad_norm": 0.46900901198387146,
"learning_rate": 8.074641712628963e-07,
"loss": 0.2223,
"step": 1285
},
{
"epoch": 0.8662849444257326,
"grad_norm": 0.5262213349342346,
"learning_rate": 7.995163439027223e-07,
"loss": 0.2444,
"step": 1286
},
{
"epoch": 0.8669585719097339,
"grad_norm": 0.5740031003952026,
"learning_rate": 7.916056224592899e-07,
"loss": 0.3013,
"step": 1287
},
{
"epoch": 0.8676321993937353,
"grad_norm": 0.48303380608558655,
"learning_rate": 7.837320507405633e-07,
"loss": 0.2352,
"step": 1288
},
{
"epoch": 0.8683058268777366,
"grad_norm": 0.5323200821876526,
"learning_rate": 7.758956723487872e-07,
"loss": 0.2453,
"step": 1289
},
{
"epoch": 0.868979454361738,
"grad_norm": 0.5278131365776062,
"learning_rate": 7.680965306802288e-07,
"loss": 0.227,
"step": 1290
},
{
"epoch": 0.8696530818457393,
"grad_norm": 0.4973738491535187,
"learning_rate": 7.603346689249515e-07,
"loss": 0.2294,
"step": 1291
},
{
"epoch": 0.8703267093297407,
"grad_norm": 0.4885808825492859,
"learning_rate": 7.526101300665692e-07,
"loss": 0.2251,
"step": 1292
},
{
"epoch": 0.871000336813742,
"grad_norm": 0.5182288289070129,
"learning_rate": 7.44922956882006e-07,
"loss": 0.2279,
"step": 1293
},
{
"epoch": 0.8716739642977434,
"grad_norm": 0.6276636123657227,
"learning_rate": 7.37273191941267e-07,
"loss": 0.2574,
"step": 1294
},
{
"epoch": 0.8723475917817447,
"grad_norm": 0.514959454536438,
"learning_rate": 7.296608776071931e-07,
"loss": 0.2344,
"step": 1295
},
{
"epoch": 0.873021219265746,
"grad_norm": 0.5535395741462708,
"learning_rate": 7.220860560352365e-07,
"loss": 0.2702,
"step": 1296
},
{
"epoch": 0.8736948467497474,
"grad_norm": 0.538652241230011,
"learning_rate": 7.145487691732194e-07,
"loss": 0.2414,
"step": 1297
},
{
"epoch": 0.8743684742337487,
"grad_norm": 0.4960603415966034,
"learning_rate": 7.070490587611014e-07,
"loss": 0.2188,
"step": 1298
},
{
"epoch": 0.8750421017177501,
"grad_norm": 0.6888505220413208,
"learning_rate": 6.995869663307588e-07,
"loss": 0.2467,
"step": 1299
},
{
"epoch": 0.8757157292017514,
"grad_norm": 0.5374876856803894,
"learning_rate": 6.921625332057413e-07,
"loss": 0.2615,
"step": 1300
},
{
"epoch": 0.8757157292017514,
"eval_loss": 0.24685746431350708,
"eval_runtime": 105.528,
"eval_samples_per_second": 47.381,
"eval_steps_per_second": 2.966,
"step": 1300
},
{
"epoch": 0.8763893566857528,
"grad_norm": 0.5609838962554932,
"learning_rate": 6.847758005010493e-07,
"loss": 0.2512,
"step": 1301
},
{
"epoch": 0.8770629841697541,
"grad_norm": 0.5215060114860535,
"learning_rate": 6.774268091229097e-07,
"loss": 0.2401,
"step": 1302
},
{
"epoch": 0.8777366116537555,
"grad_norm": 0.5238656401634216,
"learning_rate": 6.701155997685413e-07,
"loss": 0.2291,
"step": 1303
},
{
"epoch": 0.8784102391377568,
"grad_norm": 0.5399895906448364,
"learning_rate": 6.628422129259371e-07,
"loss": 0.2594,
"step": 1304
},
{
"epoch": 0.8790838666217582,
"grad_norm": 0.5661953687667847,
"learning_rate": 6.556066888736334e-07,
"loss": 0.2781,
"step": 1305
},
{
"epoch": 0.8797574941057595,
"grad_norm": 0.5317832827568054,
"learning_rate": 6.484090676804927e-07,
"loss": 0.2365,
"step": 1306
},
{
"epoch": 0.8804311215897609,
"grad_norm": 0.5083217024803162,
"learning_rate": 6.412493892054802e-07,
"loss": 0.251,
"step": 1307
},
{
"epoch": 0.8811047490737622,
"grad_norm": 0.5453861951828003,
"learning_rate": 6.341276930974377e-07,
"loss": 0.2472,
"step": 1308
},
{
"epoch": 0.8817783765577636,
"grad_norm": 0.524014949798584,
"learning_rate": 6.270440187948734e-07,
"loss": 0.2392,
"step": 1309
},
{
"epoch": 0.8824520040417649,
"grad_norm": 0.5500375628471375,
"learning_rate": 6.19998405525734e-07,
"loss": 0.2429,
"step": 1310
},
{
"epoch": 0.8831256315257663,
"grad_norm": 0.4858246445655823,
"learning_rate": 6.129908923071933e-07,
"loss": 0.2301,
"step": 1311
},
{
"epoch": 0.8837992590097676,
"grad_norm": 0.524882972240448,
"learning_rate": 6.060215179454379e-07,
"loss": 0.265,
"step": 1312
},
{
"epoch": 0.884472886493769,
"grad_norm": 0.47017255425453186,
"learning_rate": 5.990903210354456e-07,
"loss": 0.2178,
"step": 1313
},
{
"epoch": 0.8851465139777703,
"grad_norm": 0.5531392097473145,
"learning_rate": 5.921973399607738e-07,
"loss": 0.2613,
"step": 1314
},
{
"epoch": 0.8858201414617717,
"grad_norm": 0.5758329033851624,
"learning_rate": 5.853426128933548e-07,
"loss": 0.2408,
"step": 1315
},
{
"epoch": 0.886493768945773,
"grad_norm": 0.5558485984802246,
"learning_rate": 5.78526177793271e-07,
"loss": 0.271,
"step": 1316
},
{
"epoch": 0.8871673964297744,
"grad_norm": 0.5643607974052429,
"learning_rate": 5.717480724085564e-07,
"loss": 0.2524,
"step": 1317
},
{
"epoch": 0.8878410239137757,
"grad_norm": 0.5513696670532227,
"learning_rate": 5.650083342749796e-07,
"loss": 0.271,
"step": 1318
},
{
"epoch": 0.8885146513977771,
"grad_norm": 0.5440685749053955,
"learning_rate": 5.583070007158425e-07,
"loss": 0.2397,
"step": 1319
},
{
"epoch": 0.8891882788817784,
"grad_norm": 0.5702263712882996,
"learning_rate": 5.516441088417665e-07,
"loss": 0.2512,
"step": 1320
},
{
"epoch": 0.8898619063657798,
"grad_norm": 0.5015043020248413,
"learning_rate": 5.450196955504946e-07,
"loss": 0.2414,
"step": 1321
},
{
"epoch": 0.8905355338497811,
"grad_norm": 0.5015976428985596,
"learning_rate": 5.384337975266789e-07,
"loss": 0.2394,
"step": 1322
},
{
"epoch": 0.8912091613337825,
"grad_norm": 0.5432824492454529,
"learning_rate": 5.318864512416871e-07,
"loss": 0.2451,
"step": 1323
},
{
"epoch": 0.8918827888177837,
"grad_norm": 0.550553023815155,
"learning_rate": 5.253776929533898e-07,
"loss": 0.229,
"step": 1324
},
{
"epoch": 0.8925564163017851,
"grad_norm": 0.49185919761657715,
"learning_rate": 5.1890755870597e-07,
"loss": 0.2408,
"step": 1325
},
{
"epoch": 0.8932300437857864,
"grad_norm": 0.5123654007911682,
"learning_rate": 5.124760843297144e-07,
"loss": 0.2529,
"step": 1326
},
{
"epoch": 0.8939036712697878,
"grad_norm": 0.5548482537269592,
"learning_rate": 5.060833054408206e-07,
"loss": 0.2219,
"step": 1327
},
{
"epoch": 0.8945772987537891,
"grad_norm": 0.5485543608665466,
"learning_rate": 4.997292574412019e-07,
"loss": 0.281,
"step": 1328
},
{
"epoch": 0.8952509262377905,
"grad_norm": 0.49521514773368835,
"learning_rate": 4.934139755182801e-07,
"loss": 0.2481,
"step": 1329
},
{
"epoch": 0.8959245537217918,
"grad_norm": 0.5502017140388489,
"learning_rate": 4.871374946448077e-07,
"loss": 0.2141,
"step": 1330
},
{
"epoch": 0.8965981812057932,
"grad_norm": 0.5505282878875732,
"learning_rate": 4.808998495786577e-07,
"loss": 0.2761,
"step": 1331
},
{
"epoch": 0.8972718086897945,
"grad_norm": 0.5089839696884155,
"learning_rate": 4.747010748626404e-07,
"loss": 0.2086,
"step": 1332
},
{
"epoch": 0.8979454361737959,
"grad_norm": 0.556952953338623,
"learning_rate": 4.685412048243118e-07,
"loss": 0.2672,
"step": 1333
},
{
"epoch": 0.8986190636577972,
"grad_norm": 0.5195740461349487,
"learning_rate": 4.6242027357577903e-07,
"loss": 0.2466,
"step": 1334
},
{
"epoch": 0.8992926911417986,
"grad_norm": 0.528290867805481,
"learning_rate": 4.5633831501351616e-07,
"loss": 0.24,
"step": 1335
},
{
"epoch": 0.8999663186257999,
"grad_norm": 0.5216066241264343,
"learning_rate": 4.5029536281817386e-07,
"loss": 0.2312,
"step": 1336
},
{
"epoch": 0.9006399461098012,
"grad_norm": 0.473034143447876,
"learning_rate": 4.442914504543924e-07,
"loss": 0.2353,
"step": 1337
},
{
"epoch": 0.9013135735938026,
"grad_norm": 0.5582137703895569,
"learning_rate": 4.3832661117061993e-07,
"loss": 0.2657,
"step": 1338
},
{
"epoch": 0.901987201077804,
"grad_norm": 0.5023413300514221,
"learning_rate": 4.3240087799892357e-07,
"loss": 0.2335,
"step": 1339
},
{
"epoch": 0.9026608285618053,
"grad_norm": 0.5657601952552795,
"learning_rate": 4.2651428375480694e-07,
"loss": 0.2517,
"step": 1340
},
{
"epoch": 0.9033344560458066,
"grad_norm": 0.6110347509384155,
"learning_rate": 4.206668610370362e-07,
"loss": 0.2873,
"step": 1341
},
{
"epoch": 0.904008083529808,
"grad_norm": 0.5331605076789856,
"learning_rate": 4.14858642227447e-07,
"loss": 0.2316,
"step": 1342
},
{
"epoch": 0.9046817110138093,
"grad_norm": 0.5356966853141785,
"learning_rate": 4.090896594907767e-07,
"loss": 0.2124,
"step": 1343
},
{
"epoch": 0.9053553384978107,
"grad_norm": 0.5137231349945068,
"learning_rate": 4.033599447744785e-07,
"loss": 0.2475,
"step": 1344
},
{
"epoch": 0.906028965981812,
"grad_norm": 0.5301010012626648,
"learning_rate": 3.9766952980854755e-07,
"loss": 0.2319,
"step": 1345
},
{
"epoch": 0.9067025934658134,
"grad_norm": 0.5305171012878418,
"learning_rate": 3.9201844610534667e-07,
"loss": 0.2307,
"step": 1346
},
{
"epoch": 0.9073762209498147,
"grad_norm": 0.5359605550765991,
"learning_rate": 3.8640672495942777e-07,
"loss": 0.2406,
"step": 1347
},
{
"epoch": 0.9080498484338161,
"grad_norm": 0.534532904624939,
"learning_rate": 3.8083439744736296e-07,
"loss": 0.263,
"step": 1348
},
{
"epoch": 0.9087234759178174,
"grad_norm": 0.5522202253341675,
"learning_rate": 3.75301494427569e-07,
"loss": 0.257,
"step": 1349
},
{
"epoch": 0.9093971034018188,
"grad_norm": 0.5170401334762573,
"learning_rate": 3.6980804654013794e-07,
"loss": 0.2534,
"step": 1350
},
{
"epoch": 0.9100707308858201,
"grad_norm": 0.542862594127655,
"learning_rate": 3.643540842066692e-07,
"loss": 0.2502,
"step": 1351
},
{
"epoch": 0.9107443583698215,
"grad_norm": 0.5424035787582397,
"learning_rate": 3.5893963763009713e-07,
"loss": 0.2531,
"step": 1352
},
{
"epoch": 0.9114179858538228,
"grad_norm": 0.517169177532196,
"learning_rate": 3.5356473679452524e-07,
"loss": 0.2209,
"step": 1353
},
{
"epoch": 0.9120916133378242,
"grad_norm": 0.536840558052063,
"learning_rate": 3.482294114650639e-07,
"loss": 0.2681,
"step": 1354
},
{
"epoch": 0.9127652408218255,
"grad_norm": 0.6323632001876831,
"learning_rate": 3.4293369118765794e-07,
"loss": 0.3221,
"step": 1355
},
{
"epoch": 0.9134388683058269,
"grad_norm": 0.5508329272270203,
"learning_rate": 3.3767760528893356e-07,
"loss": 0.2675,
"step": 1356
},
{
"epoch": 0.9141124957898282,
"grad_norm": 0.5150956511497498,
"learning_rate": 3.324611828760241e-07,
"loss": 0.2383,
"step": 1357
},
{
"epoch": 0.9147861232738296,
"grad_norm": 0.5284437537193298,
"learning_rate": 3.272844528364161e-07,
"loss": 0.2326,
"step": 1358
},
{
"epoch": 0.9154597507578309,
"grad_norm": 0.4829590618610382,
"learning_rate": 3.221474438377903e-07,
"loss": 0.2247,
"step": 1359
},
{
"epoch": 0.9161333782418323,
"grad_norm": 0.5343140363693237,
"learning_rate": 3.1705018432785673e-07,
"loss": 0.2778,
"step": 1360
},
{
"epoch": 0.9168070057258336,
"grad_norm": 0.5202133655548096,
"learning_rate": 3.1199270253420397e-07,
"loss": 0.262,
"step": 1361
},
{
"epoch": 0.917480633209835,
"grad_norm": 0.6042024493217468,
"learning_rate": 3.069750264641369e-07,
"loss": 0.3138,
"step": 1362
},
{
"epoch": 0.9181542606938363,
"grad_norm": 0.4585079550743103,
"learning_rate": 3.0199718390452825e-07,
"loss": 0.1988,
"step": 1363
},
{
"epoch": 0.9188278881778377,
"grad_norm": 0.5287424921989441,
"learning_rate": 2.9705920242165565e-07,
"loss": 0.2417,
"step": 1364
},
{
"epoch": 0.919501515661839,
"grad_norm": 0.5599202513694763,
"learning_rate": 2.9216110936105906e-07,
"loss": 0.2709,
"step": 1365
},
{
"epoch": 0.9201751431458404,
"grad_norm": 0.5495063066482544,
"learning_rate": 2.8730293184738105e-07,
"loss": 0.2546,
"step": 1366
},
{
"epoch": 0.9208487706298417,
"grad_norm": 0.529340386390686,
"learning_rate": 2.8248469678422346e-07,
"loss": 0.2454,
"step": 1367
},
{
"epoch": 0.9215223981138431,
"grad_norm": 0.5447036027908325,
"learning_rate": 2.7770643085399004e-07,
"loss": 0.2703,
"step": 1368
},
{
"epoch": 0.9221960255978444,
"grad_norm": 0.5069358348846436,
"learning_rate": 2.729681605177492e-07,
"loss": 0.2529,
"step": 1369
},
{
"epoch": 0.9228696530818458,
"grad_norm": 0.5129917860031128,
"learning_rate": 2.6826991201507724e-07,
"loss": 0.2237,
"step": 1370
},
{
"epoch": 0.9235432805658471,
"grad_norm": 0.5351532697677612,
"learning_rate": 2.636117113639194e-07,
"loss": 0.2592,
"step": 1371
},
{
"epoch": 0.9242169080498485,
"grad_norm": 0.5014567375183105,
"learning_rate": 2.589935843604452e-07,
"loss": 0.2112,
"step": 1372
},
{
"epoch": 0.9248905355338498,
"grad_norm": 0.5409959554672241,
"learning_rate": 2.54415556578903e-07,
"loss": 0.259,
"step": 1373
},
{
"epoch": 0.9255641630178512,
"grad_norm": 0.547943115234375,
"learning_rate": 2.4987765337148e-07,
"loss": 0.262,
"step": 1374
},
{
"epoch": 0.9262377905018525,
"grad_norm": 0.5011503100395203,
"learning_rate": 2.453798998681625e-07,
"loss": 0.2436,
"step": 1375
},
{
"epoch": 0.9269114179858539,
"grad_norm": 0.5170352458953857,
"learning_rate": 2.4092232097659486e-07,
"loss": 0.2529,
"step": 1376
},
{
"epoch": 0.9275850454698552,
"grad_norm": 0.5595082640647888,
"learning_rate": 2.3650494138194257e-07,
"loss": 0.2843,
"step": 1377
},
{
"epoch": 0.9282586729538566,
"grad_norm": 0.5302537679672241,
"learning_rate": 2.3212778554675766e-07,
"loss": 0.2382,
"step": 1378
},
{
"epoch": 0.9289323004378579,
"grad_norm": 0.5083282589912415,
"learning_rate": 2.277908777108387e-07,
"loss": 0.2587,
"step": 1379
},
{
"epoch": 0.9296059279218593,
"grad_norm": 0.512374222278595,
"learning_rate": 2.2349424189109984e-07,
"loss": 0.2337,
"step": 1380
},
{
"epoch": 0.9302795554058606,
"grad_norm": 0.5485064387321472,
"learning_rate": 2.192379018814372e-07,
"loss": 0.263,
"step": 1381
},
{
"epoch": 0.930953182889862,
"grad_norm": 0.5666427612304688,
"learning_rate": 2.150218812525953e-07,
"loss": 0.2683,
"step": 1382
},
{
"epoch": 0.9316268103738633,
"grad_norm": 0.5534345507621765,
"learning_rate": 2.1084620335204225e-07,
"loss": 0.2069,
"step": 1383
},
{
"epoch": 0.9323004378578647,
"grad_norm": 0.5635040402412415,
"learning_rate": 2.0671089130383152e-07,
"loss": 0.3081,
"step": 1384
},
{
"epoch": 0.932974065341866,
"grad_norm": 0.5579578280448914,
"learning_rate": 2.0261596800848132e-07,
"loss": 0.2694,
"step": 1385
},
{
"epoch": 0.9336476928258673,
"grad_norm": 0.5865366458892822,
"learning_rate": 1.9856145614284616e-07,
"loss": 0.261,
"step": 1386
},
{
"epoch": 0.9343213203098687,
"grad_norm": 0.5374801754951477,
"learning_rate": 1.9454737815998546e-07,
"loss": 0.258,
"step": 1387
},
{
"epoch": 0.9349949477938699,
"grad_norm": 0.4848352372646332,
"learning_rate": 1.9057375628905112e-07,
"loss": 0.2121,
"step": 1388
},
{
"epoch": 0.9356685752778713,
"grad_norm": 0.5470321774482727,
"learning_rate": 1.8664061253514997e-07,
"loss": 0.2556,
"step": 1389
},
{
"epoch": 0.9363422027618726,
"grad_norm": 0.4898010194301605,
"learning_rate": 1.8274796867923578e-07,
"loss": 0.2436,
"step": 1390
},
{
"epoch": 0.937015830245874,
"grad_norm": 0.5198950171470642,
"learning_rate": 1.788958462779766e-07,
"loss": 0.2523,
"step": 1391
},
{
"epoch": 0.9376894577298753,
"grad_norm": 0.5553786754608154,
"learning_rate": 1.750842666636443e-07,
"loss": 0.2764,
"step": 1392
},
{
"epoch": 0.9383630852138767,
"grad_norm": 0.5620520114898682,
"learning_rate": 1.7131325094399352e-07,
"loss": 0.249,
"step": 1393
},
{
"epoch": 0.939036712697878,
"grad_norm": 0.5561796426773071,
"learning_rate": 1.6758282000214202e-07,
"loss": 0.2581,
"step": 1394
},
{
"epoch": 0.9397103401818794,
|
"grad_norm": 0.5772345066070557, |
|
"learning_rate": 1.6389299449645734e-07, |
|
"loss": 0.2718, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.9403839676658807, |
|
"grad_norm": 0.5675050020217896, |
|
"learning_rate": 1.6024379486044517e-07, |
|
"loss": 0.2668, |
|
"step": 1396 |
|
}, |
|
{ |
|
"epoch": 0.9410575951498821, |
|
"grad_norm": 0.517271876335144, |
|
"learning_rate": 1.5663524130262867e-07, |
|
"loss": 0.2287, |
|
"step": 1397 |
|
}, |
|
{ |
|
"epoch": 0.9417312226338834, |
|
"grad_norm": 0.5633882284164429, |
|
"learning_rate": 1.5306735380644698e-07, |
|
"loss": 0.2676, |
|
"step": 1398 |
|
}, |
|
{ |
|
"epoch": 0.9424048501178848, |
|
"grad_norm": 0.5311648845672607, |
|
"learning_rate": 1.4954015213013427e-07, |
|
"loss": 0.2269, |
|
"step": 1399 |
|
}, |
|
{ |
|
"epoch": 0.9430784776018861, |
|
"grad_norm": 0.4805348813533783, |
|
"learning_rate": 1.4605365580661668e-07, |
|
"loss": 0.2116, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.9430784776018861, |
|
"eval_loss": 0.24632865190505981, |
|
"eval_runtime": 107.1175, |
|
"eval_samples_per_second": 46.678, |
|
"eval_steps_per_second": 2.922, |
|
"step": 1400 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1484, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.577960594384093e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|