szymonrucinski commited on
Commit
3c6a2d8
1 Parent(s): 39914ce

Update to 3rd epoch (#3)

Browse files

- Update to 3rd epoch (d0952bd6981f1ea64fbf5c45bfac4813048441c2)

Files changed (5) hide show
  1. adapter_model.bin +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +3 -1203
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f60ab1968e3383e1d0d99a3068cc74957a84c043bf5eda03d22efd0f8ea9f8f9
3
  size 134263757
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e74d42214f527d418c0910ec128f6d3b5f1523570670c171baf66d4a0b95cefb
3
  size 134263757
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81e5d1b0fcd6e6a2841ac0a59753960210a56af903a223d6b0a46462ec2c9c96
3
  size 268514565
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25a3db7ff130b07d4e4b23a83e562d5a4ef3cf845f13e971f940342e4037d75b
3
  size 268514565
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:757b82c50c74e12673c2fa57a7a74b1902252dcab25b574e433c6ead4b7d6d80
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:846c8f223a95d3e869043e3bb0dfb8a2e257ae81486c479822c9e12ad0bacd7c
3
  size 14575
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:48d79510a393105d03ace4b25923bb16766b9918be883f120389bf31e783c069
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb57538b0fddb988f32d5e33311bcf25efee1aa4001ec3e33ef4a2dd884d77d3
3
  size 627
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 7.230657989877079,
5
- "global_step": 5000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -1806,1211 +1806,11 @@
1806
  "learning_rate": 0.0002,
1807
  "loss": 0.8549,
1808
  "step": 3000
1809
- },
1810
- {
1811
- "epoch": 4.35,
1812
- "learning_rate": 0.0002,
1813
- "loss": 0.8248,
1814
- "step": 3010
1815
- },
1816
- {
1817
- "epoch": 4.37,
1818
- "learning_rate": 0.0002,
1819
- "loss": 0.8262,
1820
- "step": 3020
1821
- },
1822
- {
1823
- "epoch": 4.38,
1824
- "learning_rate": 0.0002,
1825
- "loss": 0.9076,
1826
- "step": 3030
1827
- },
1828
- {
1829
- "epoch": 4.4,
1830
- "learning_rate": 0.0002,
1831
- "loss": 0.8706,
1832
- "step": 3040
1833
- },
1834
- {
1835
- "epoch": 4.41,
1836
- "learning_rate": 0.0002,
1837
- "loss": 0.8503,
1838
- "step": 3050
1839
- },
1840
- {
1841
- "epoch": 4.43,
1842
- "learning_rate": 0.0002,
1843
- "loss": 0.8351,
1844
- "step": 3060
1845
- },
1846
- {
1847
- "epoch": 4.44,
1848
- "learning_rate": 0.0002,
1849
- "loss": 0.8267,
1850
- "step": 3070
1851
- },
1852
- {
1853
- "epoch": 4.45,
1854
- "learning_rate": 0.0002,
1855
- "loss": 0.9008,
1856
- "step": 3080
1857
- },
1858
- {
1859
- "epoch": 4.47,
1860
- "learning_rate": 0.0002,
1861
- "loss": 0.8624,
1862
- "step": 3090
1863
- },
1864
- {
1865
- "epoch": 4.48,
1866
- "learning_rate": 0.0002,
1867
- "loss": 0.8703,
1868
- "step": 3100
1869
- },
1870
- {
1871
- "epoch": 4.5,
1872
- "learning_rate": 0.0002,
1873
- "loss": 0.8284,
1874
- "step": 3110
1875
- },
1876
- {
1877
- "epoch": 4.51,
1878
- "learning_rate": 0.0002,
1879
- "loss": 0.7899,
1880
- "step": 3120
1881
- },
1882
- {
1883
- "epoch": 4.53,
1884
- "learning_rate": 0.0002,
1885
- "loss": 0.9195,
1886
- "step": 3130
1887
- },
1888
- {
1889
- "epoch": 4.54,
1890
- "learning_rate": 0.0002,
1891
- "loss": 0.881,
1892
- "step": 3140
1893
- },
1894
- {
1895
- "epoch": 4.56,
1896
- "learning_rate": 0.0002,
1897
- "loss": 0.8647,
1898
- "step": 3150
1899
- },
1900
- {
1901
- "epoch": 4.57,
1902
- "learning_rate": 0.0002,
1903
- "loss": 0.8115,
1904
- "step": 3160
1905
- },
1906
- {
1907
- "epoch": 4.58,
1908
- "learning_rate": 0.0002,
1909
- "loss": 0.8282,
1910
- "step": 3170
1911
- },
1912
- {
1913
- "epoch": 4.6,
1914
- "learning_rate": 0.0002,
1915
- "loss": 0.9087,
1916
- "step": 3180
1917
- },
1918
- {
1919
- "epoch": 4.61,
1920
- "learning_rate": 0.0002,
1921
- "loss": 0.8789,
1922
- "step": 3190
1923
- },
1924
- {
1925
- "epoch": 4.63,
1926
- "learning_rate": 0.0002,
1927
- "loss": 0.8359,
1928
- "step": 3200
1929
- },
1930
- {
1931
- "epoch": 4.64,
1932
- "learning_rate": 0.0002,
1933
- "loss": 0.8223,
1934
- "step": 3210
1935
- },
1936
- {
1937
- "epoch": 4.66,
1938
- "learning_rate": 0.0002,
1939
- "loss": 0.8178,
1940
- "step": 3220
1941
- },
1942
- {
1943
- "epoch": 4.67,
1944
- "learning_rate": 0.0002,
1945
- "loss": 0.9243,
1946
- "step": 3230
1947
- },
1948
- {
1949
- "epoch": 4.69,
1950
- "learning_rate": 0.0002,
1951
- "loss": 0.8732,
1952
- "step": 3240
1953
- },
1954
- {
1955
- "epoch": 4.7,
1956
- "learning_rate": 0.0002,
1957
- "loss": 0.8427,
1958
- "step": 3250
1959
- },
1960
- {
1961
- "epoch": 4.71,
1962
- "learning_rate": 0.0002,
1963
- "loss": 0.8358,
1964
- "step": 3260
1965
- },
1966
- {
1967
- "epoch": 4.73,
1968
- "learning_rate": 0.0002,
1969
- "loss": 0.8308,
1970
- "step": 3270
1971
- },
1972
- {
1973
- "epoch": 4.74,
1974
- "learning_rate": 0.0002,
1975
- "loss": 0.9175,
1976
- "step": 3280
1977
- },
1978
- {
1979
- "epoch": 4.76,
1980
- "learning_rate": 0.0002,
1981
- "loss": 0.8806,
1982
- "step": 3290
1983
- },
1984
- {
1985
- "epoch": 4.77,
1986
- "learning_rate": 0.0002,
1987
- "loss": 0.848,
1988
- "step": 3300
1989
- },
1990
- {
1991
- "epoch": 4.79,
1992
- "learning_rate": 0.0002,
1993
- "loss": 0.8159,
1994
- "step": 3310
1995
- },
1996
- {
1997
- "epoch": 4.8,
1998
- "learning_rate": 0.0002,
1999
- "loss": 0.8202,
2000
- "step": 3320
2001
- },
2002
- {
2003
- "epoch": 4.82,
2004
- "learning_rate": 0.0002,
2005
- "loss": 0.9166,
2006
- "step": 3330
2007
- },
2008
- {
2009
- "epoch": 4.83,
2010
- "learning_rate": 0.0002,
2011
- "loss": 0.8756,
2012
- "step": 3340
2013
- },
2014
- {
2015
- "epoch": 4.84,
2016
- "learning_rate": 0.0002,
2017
- "loss": 0.842,
2018
- "step": 3350
2019
- },
2020
- {
2021
- "epoch": 4.86,
2022
- "learning_rate": 0.0002,
2023
- "loss": 0.831,
2024
- "step": 3360
2025
- },
2026
- {
2027
- "epoch": 4.87,
2028
- "learning_rate": 0.0002,
2029
- "loss": 0.8228,
2030
- "step": 3370
2031
- },
2032
- {
2033
- "epoch": 4.89,
2034
- "learning_rate": 0.0002,
2035
- "loss": 0.9109,
2036
- "step": 3380
2037
- },
2038
- {
2039
- "epoch": 4.9,
2040
- "learning_rate": 0.0002,
2041
- "loss": 0.8746,
2042
- "step": 3390
2043
- },
2044
- {
2045
- "epoch": 4.92,
2046
- "learning_rate": 0.0002,
2047
- "loss": 0.8605,
2048
- "step": 3400
2049
- },
2050
- {
2051
- "epoch": 4.93,
2052
- "learning_rate": 0.0002,
2053
- "loss": 0.8263,
2054
- "step": 3410
2055
- },
2056
- {
2057
- "epoch": 4.95,
2058
- "learning_rate": 0.0002,
2059
- "loss": 0.8009,
2060
- "step": 3420
2061
- },
2062
- {
2063
- "epoch": 4.96,
2064
- "learning_rate": 0.0002,
2065
- "loss": 0.9056,
2066
- "step": 3430
2067
- },
2068
- {
2069
- "epoch": 4.97,
2070
- "learning_rate": 0.0002,
2071
- "loss": 0.8647,
2072
- "step": 3440
2073
- },
2074
- {
2075
- "epoch": 4.99,
2076
- "learning_rate": 0.0002,
2077
- "loss": 0.8481,
2078
- "step": 3450
2079
- },
2080
- {
2081
- "epoch": 5.0,
2082
- "learning_rate": 0.0002,
2083
- "loss": 0.7768,
2084
- "step": 3460
2085
- },
2086
- {
2087
- "epoch": 5.02,
2088
- "learning_rate": 0.0002,
2089
- "loss": 0.8872,
2090
- "step": 3470
2091
- },
2092
- {
2093
- "epoch": 5.03,
2094
- "learning_rate": 0.0002,
2095
- "loss": 0.8517,
2096
- "step": 3480
2097
- },
2098
- {
2099
- "epoch": 5.05,
2100
- "learning_rate": 0.0002,
2101
- "loss": 0.8096,
2102
- "step": 3490
2103
- },
2104
- {
2105
- "epoch": 5.06,
2106
- "learning_rate": 0.0002,
2107
- "loss": 0.7853,
2108
- "step": 3500
2109
- },
2110
- {
2111
- "epoch": 5.08,
2112
- "learning_rate": 0.0002,
2113
- "loss": 0.7469,
2114
- "step": 3510
2115
- },
2116
- {
2117
- "epoch": 5.09,
2118
- "learning_rate": 0.0002,
2119
- "loss": 0.9045,
2120
- "step": 3520
2121
- },
2122
- {
2123
- "epoch": 5.1,
2124
- "learning_rate": 0.0002,
2125
- "loss": 0.8387,
2126
- "step": 3530
2127
- },
2128
- {
2129
- "epoch": 5.12,
2130
- "learning_rate": 0.0002,
2131
- "loss": 0.8039,
2132
- "step": 3540
2133
- },
2134
- {
2135
- "epoch": 5.13,
2136
- "learning_rate": 0.0002,
2137
- "loss": 0.7629,
2138
- "step": 3550
2139
- },
2140
- {
2141
- "epoch": 5.15,
2142
- "learning_rate": 0.0002,
2143
- "loss": 0.7336,
2144
- "step": 3560
2145
- },
2146
- {
2147
- "epoch": 5.16,
2148
- "learning_rate": 0.0002,
2149
- "loss": 0.9018,
2150
- "step": 3570
2151
- },
2152
- {
2153
- "epoch": 5.18,
2154
- "learning_rate": 0.0002,
2155
- "loss": 0.8511,
2156
- "step": 3580
2157
- },
2158
- {
2159
- "epoch": 5.19,
2160
- "learning_rate": 0.0002,
2161
- "loss": 0.8043,
2162
- "step": 3590
2163
- },
2164
- {
2165
- "epoch": 5.21,
2166
- "learning_rate": 0.0002,
2167
- "loss": 0.7877,
2168
- "step": 3600
2169
- },
2170
- {
2171
- "epoch": 5.22,
2172
- "learning_rate": 0.0002,
2173
- "loss": 0.7438,
2174
- "step": 3610
2175
- },
2176
- {
2177
- "epoch": 5.23,
2178
- "learning_rate": 0.0002,
2179
- "loss": 0.903,
2180
- "step": 3620
2181
- },
2182
- {
2183
- "epoch": 5.25,
2184
- "learning_rate": 0.0002,
2185
- "loss": 0.8562,
2186
- "step": 3630
2187
- },
2188
- {
2189
- "epoch": 5.26,
2190
- "learning_rate": 0.0002,
2191
- "loss": 0.8198,
2192
- "step": 3640
2193
- },
2194
- {
2195
- "epoch": 5.28,
2196
- "learning_rate": 0.0002,
2197
- "loss": 0.7953,
2198
- "step": 3650
2199
- },
2200
- {
2201
- "epoch": 5.29,
2202
- "learning_rate": 0.0002,
2203
- "loss": 0.7606,
2204
- "step": 3660
2205
- },
2206
- {
2207
- "epoch": 5.31,
2208
- "learning_rate": 0.0002,
2209
- "loss": 0.8996,
2210
- "step": 3670
2211
- },
2212
- {
2213
- "epoch": 5.32,
2214
- "learning_rate": 0.0002,
2215
- "loss": 0.8519,
2216
- "step": 3680
2217
- },
2218
- {
2219
- "epoch": 5.34,
2220
- "learning_rate": 0.0002,
2221
- "loss": 0.8307,
2222
- "step": 3690
2223
- },
2224
- {
2225
- "epoch": 5.35,
2226
- "learning_rate": 0.0002,
2227
- "loss": 0.7875,
2228
- "step": 3700
2229
- },
2230
- {
2231
- "epoch": 5.37,
2232
- "learning_rate": 0.0002,
2233
- "loss": 0.7508,
2234
- "step": 3710
2235
- },
2236
- {
2237
- "epoch": 5.38,
2238
- "learning_rate": 0.0002,
2239
- "loss": 0.9087,
2240
- "step": 3720
2241
- },
2242
- {
2243
- "epoch": 5.39,
2244
- "learning_rate": 0.0002,
2245
- "loss": 0.8564,
2246
- "step": 3730
2247
- },
2248
- {
2249
- "epoch": 5.41,
2250
- "learning_rate": 0.0002,
2251
- "loss": 0.8265,
2252
- "step": 3740
2253
- },
2254
- {
2255
- "epoch": 5.42,
2256
- "learning_rate": 0.0002,
2257
- "loss": 0.794,
2258
- "step": 3750
2259
- },
2260
- {
2261
- "epoch": 5.44,
2262
- "learning_rate": 0.0002,
2263
- "loss": 0.7502,
2264
- "step": 3760
2265
- },
2266
- {
2267
- "epoch": 5.45,
2268
- "learning_rate": 0.0002,
2269
- "loss": 0.8966,
2270
- "step": 3770
2271
- },
2272
- {
2273
- "epoch": 5.47,
2274
- "learning_rate": 0.0002,
2275
- "loss": 0.8521,
2276
- "step": 3780
2277
- },
2278
- {
2279
- "epoch": 5.48,
2280
- "learning_rate": 0.0002,
2281
- "loss": 0.8195,
2282
- "step": 3790
2283
- },
2284
- {
2285
- "epoch": 5.5,
2286
- "learning_rate": 0.0002,
2287
- "loss": 0.7985,
2288
- "step": 3800
2289
- },
2290
- {
2291
- "epoch": 5.51,
2292
- "learning_rate": 0.0002,
2293
- "loss": 0.7676,
2294
- "step": 3810
2295
- },
2296
- {
2297
- "epoch": 5.52,
2298
- "learning_rate": 0.0002,
2299
- "loss": 0.9075,
2300
- "step": 3820
2301
- },
2302
- {
2303
- "epoch": 5.54,
2304
- "learning_rate": 0.0002,
2305
- "loss": 0.852,
2306
- "step": 3830
2307
- },
2308
- {
2309
- "epoch": 5.55,
2310
- "learning_rate": 0.0002,
2311
- "loss": 0.8306,
2312
- "step": 3840
2313
- },
2314
- {
2315
- "epoch": 5.57,
2316
- "learning_rate": 0.0002,
2317
- "loss": 0.7964,
2318
- "step": 3850
2319
- },
2320
- {
2321
- "epoch": 5.58,
2322
- "learning_rate": 0.0002,
2323
- "loss": 0.7599,
2324
- "step": 3860
2325
- },
2326
- {
2327
- "epoch": 5.6,
2328
- "learning_rate": 0.0002,
2329
- "loss": 0.8959,
2330
- "step": 3870
2331
- },
2332
- {
2333
- "epoch": 5.61,
2334
- "learning_rate": 0.0002,
2335
- "loss": 0.8489,
2336
- "step": 3880
2337
- },
2338
- {
2339
- "epoch": 5.63,
2340
- "learning_rate": 0.0002,
2341
- "loss": 0.8095,
2342
- "step": 3890
2343
- },
2344
- {
2345
- "epoch": 5.64,
2346
- "learning_rate": 0.0002,
2347
- "loss": 0.7912,
2348
- "step": 3900
2349
- },
2350
- {
2351
- "epoch": 5.65,
2352
- "learning_rate": 0.0002,
2353
- "loss": 0.7555,
2354
- "step": 3910
2355
- },
2356
- {
2357
- "epoch": 5.67,
2358
- "learning_rate": 0.0002,
2359
- "loss": 0.8884,
2360
- "step": 3920
2361
- },
2362
- {
2363
- "epoch": 5.68,
2364
- "learning_rate": 0.0002,
2365
- "loss": 0.8375,
2366
- "step": 3930
2367
- },
2368
- {
2369
- "epoch": 5.7,
2370
- "learning_rate": 0.0002,
2371
- "loss": 0.8313,
2372
- "step": 3940
2373
- },
2374
- {
2375
- "epoch": 5.71,
2376
- "learning_rate": 0.0002,
2377
- "loss": 0.7983,
2378
- "step": 3950
2379
- },
2380
- {
2381
- "epoch": 5.73,
2382
- "learning_rate": 0.0002,
2383
- "loss": 0.7691,
2384
- "step": 3960
2385
- },
2386
- {
2387
- "epoch": 5.74,
2388
- "learning_rate": 0.0002,
2389
- "loss": 0.8946,
2390
- "step": 3970
2391
- },
2392
- {
2393
- "epoch": 5.76,
2394
- "learning_rate": 0.0002,
2395
- "loss": 0.8528,
2396
- "step": 3980
2397
- },
2398
- {
2399
- "epoch": 5.77,
2400
- "learning_rate": 0.0002,
2401
- "loss": 0.8293,
2402
- "step": 3990
2403
- },
2404
- {
2405
- "epoch": 5.78,
2406
- "learning_rate": 0.0002,
2407
- "loss": 0.7929,
2408
- "step": 4000
2409
- },
2410
- {
2411
- "epoch": 5.8,
2412
- "learning_rate": 0.0002,
2413
- "loss": 0.773,
2414
- "step": 4010
2415
- },
2416
- {
2417
- "epoch": 5.81,
2418
- "learning_rate": 0.0002,
2419
- "loss": 0.8988,
2420
- "step": 4020
2421
- },
2422
- {
2423
- "epoch": 5.83,
2424
- "learning_rate": 0.0002,
2425
- "loss": 0.8639,
2426
- "step": 4030
2427
- },
2428
- {
2429
- "epoch": 5.84,
2430
- "learning_rate": 0.0002,
2431
- "loss": 0.8219,
2432
- "step": 4040
2433
- },
2434
- {
2435
- "epoch": 5.86,
2436
- "learning_rate": 0.0002,
2437
- "loss": 0.8106,
2438
- "step": 4050
2439
- },
2440
- {
2441
- "epoch": 5.87,
2442
- "learning_rate": 0.0002,
2443
- "loss": 0.7746,
2444
- "step": 4060
2445
- },
2446
- {
2447
- "epoch": 5.89,
2448
- "learning_rate": 0.0002,
2449
- "loss": 0.8874,
2450
- "step": 4070
2451
- },
2452
- {
2453
- "epoch": 5.9,
2454
- "learning_rate": 0.0002,
2455
- "loss": 0.8497,
2456
- "step": 4080
2457
- },
2458
- {
2459
- "epoch": 5.91,
2460
- "learning_rate": 0.0002,
2461
- "loss": 0.818,
2462
- "step": 4090
2463
- },
2464
- {
2465
- "epoch": 5.93,
2466
- "learning_rate": 0.0002,
2467
- "loss": 0.7891,
2468
- "step": 4100
2469
- },
2470
- {
2471
- "epoch": 5.94,
2472
- "learning_rate": 0.0002,
2473
- "loss": 0.7508,
2474
- "step": 4110
2475
- },
2476
- {
2477
- "epoch": 5.96,
2478
- "learning_rate": 0.0002,
2479
- "loss": 0.8985,
2480
- "step": 4120
2481
- },
2482
- {
2483
- "epoch": 5.97,
2484
- "learning_rate": 0.0002,
2485
- "loss": 0.8496,
2486
- "step": 4130
2487
- },
2488
- {
2489
- "epoch": 5.99,
2490
- "learning_rate": 0.0002,
2491
- "loss": 0.8176,
2492
- "step": 4140
2493
- },
2494
- {
2495
- "epoch": 6.0,
2496
- "learning_rate": 0.0002,
2497
- "loss": 0.7464,
2498
- "step": 4150
2499
- },
2500
- {
2501
- "epoch": 6.02,
2502
- "learning_rate": 0.0002,
2503
- "loss": 0.8725,
2504
- "step": 4160
2505
- },
2506
- {
2507
- "epoch": 6.03,
2508
- "learning_rate": 0.0002,
2509
- "loss": 0.8238,
2510
- "step": 4170
2511
- },
2512
- {
2513
- "epoch": 6.04,
2514
- "learning_rate": 0.0002,
2515
- "loss": 0.7955,
2516
- "step": 4180
2517
- },
2518
- {
2519
- "epoch": 6.06,
2520
- "learning_rate": 0.0002,
2521
- "loss": 0.7304,
2522
- "step": 4190
2523
- },
2524
- {
2525
- "epoch": 6.07,
2526
- "learning_rate": 0.0002,
2527
- "loss": 0.6796,
2528
- "step": 4200
2529
- },
2530
- {
2531
- "epoch": 6.09,
2532
- "learning_rate": 0.0002,
2533
- "loss": 0.887,
2534
- "step": 4210
2535
- },
2536
- {
2537
- "epoch": 6.1,
2538
- "learning_rate": 0.0002,
2539
- "loss": 0.831,
2540
- "step": 4220
2541
- },
2542
- {
2543
- "epoch": 6.12,
2544
- "learning_rate": 0.0002,
2545
- "loss": 0.7878,
2546
- "step": 4230
2547
- },
2548
- {
2549
- "epoch": 6.13,
2550
- "learning_rate": 0.0002,
2551
- "loss": 0.7574,
2552
- "step": 4240
2553
- },
2554
- {
2555
- "epoch": 6.15,
2556
- "learning_rate": 0.0002,
2557
- "loss": 0.6826,
2558
- "step": 4250
2559
- },
2560
- {
2561
- "epoch": 6.16,
2562
- "learning_rate": 0.0002,
2563
- "loss": 0.881,
2564
- "step": 4260
2565
- },
2566
- {
2567
- "epoch": 6.17,
2568
- "learning_rate": 0.0002,
2569
- "loss": 0.8245,
2570
- "step": 4270
2571
- },
2572
- {
2573
- "epoch": 6.19,
2574
- "learning_rate": 0.0002,
2575
- "loss": 0.7827,
2576
- "step": 4280
2577
- },
2578
- {
2579
- "epoch": 6.2,
2580
- "learning_rate": 0.0002,
2581
- "loss": 0.7556,
2582
- "step": 4290
2583
- },
2584
- {
2585
- "epoch": 6.22,
2586
- "learning_rate": 0.0002,
2587
- "loss": 0.6812,
2588
- "step": 4300
2589
- },
2590
- {
2591
- "epoch": 6.23,
2592
- "learning_rate": 0.0002,
2593
- "loss": 0.8812,
2594
- "step": 4310
2595
- },
2596
- {
2597
- "epoch": 6.25,
2598
- "learning_rate": 0.0002,
2599
- "loss": 0.8373,
2600
- "step": 4320
2601
- },
2602
- {
2603
- "epoch": 6.26,
2604
- "learning_rate": 0.0002,
2605
- "loss": 0.7915,
2606
- "step": 4330
2607
- },
2608
- {
2609
- "epoch": 6.28,
2610
- "learning_rate": 0.0002,
2611
- "loss": 0.7575,
2612
- "step": 4340
2613
- },
2614
- {
2615
- "epoch": 6.29,
2616
- "learning_rate": 0.0002,
2617
- "loss": 0.6715,
2618
- "step": 4350
2619
- },
2620
- {
2621
- "epoch": 6.31,
2622
- "learning_rate": 0.0002,
2623
- "loss": 0.8731,
2624
- "step": 4360
2625
- },
2626
- {
2627
- "epoch": 6.32,
2628
- "learning_rate": 0.0002,
2629
- "loss": 0.8362,
2630
- "step": 4370
2631
- },
2632
- {
2633
- "epoch": 6.33,
2634
- "learning_rate": 0.0002,
2635
- "loss": 0.7917,
2636
- "step": 4380
2637
- },
2638
- {
2639
- "epoch": 6.35,
2640
- "learning_rate": 0.0002,
2641
- "loss": 0.7803,
2642
- "step": 4390
2643
- },
2644
- {
2645
- "epoch": 6.36,
2646
- "learning_rate": 0.0002,
2647
- "loss": 0.6743,
2648
- "step": 4400
2649
- },
2650
- {
2651
- "epoch": 6.38,
2652
- "learning_rate": 0.0002,
2653
- "loss": 0.8894,
2654
- "step": 4410
2655
- },
2656
- {
2657
- "epoch": 6.39,
2658
- "learning_rate": 0.0002,
2659
- "loss": 0.8375,
2660
- "step": 4420
2661
- },
2662
- {
2663
- "epoch": 6.41,
2664
- "learning_rate": 0.0002,
2665
- "loss": 0.793,
2666
- "step": 4430
2667
- },
2668
- {
2669
- "epoch": 6.42,
2670
- "learning_rate": 0.0002,
2671
- "loss": 0.7616,
2672
- "step": 4440
2673
- },
2674
- {
2675
- "epoch": 6.44,
2676
- "learning_rate": 0.0002,
2677
- "loss": 0.6821,
2678
- "step": 4450
2679
- },
2680
- {
2681
- "epoch": 6.45,
2682
- "learning_rate": 0.0002,
2683
- "loss": 0.884,
2684
- "step": 4460
2685
- },
2686
- {
2687
- "epoch": 6.46,
2688
- "learning_rate": 0.0002,
2689
- "loss": 0.8264,
2690
- "step": 4470
2691
- },
2692
- {
2693
- "epoch": 6.48,
2694
- "learning_rate": 0.0002,
2695
- "loss": 0.7921,
2696
- "step": 4480
2697
- },
2698
- {
2699
- "epoch": 6.49,
2700
- "learning_rate": 0.0002,
2701
- "loss": 0.7699,
2702
- "step": 4490
2703
- },
2704
- {
2705
- "epoch": 6.51,
2706
- "learning_rate": 0.0002,
2707
- "loss": 0.6964,
2708
- "step": 4500
2709
- },
2710
- {
2711
- "epoch": 6.52,
2712
- "learning_rate": 0.0002,
2713
- "loss": 0.8871,
2714
- "step": 4510
2715
- },
2716
- {
2717
- "epoch": 6.54,
2718
- "learning_rate": 0.0002,
2719
- "loss": 0.8286,
2720
- "step": 4520
2721
- },
2722
- {
2723
- "epoch": 6.55,
2724
- "learning_rate": 0.0002,
2725
- "loss": 0.792,
2726
- "step": 4530
2727
- },
2728
- {
2729
- "epoch": 6.57,
2730
- "learning_rate": 0.0002,
2731
- "loss": 0.7692,
2732
- "step": 4540
2733
- },
2734
- {
2735
- "epoch": 6.58,
2736
- "learning_rate": 0.0002,
2737
- "loss": 0.6903,
2738
- "step": 4550
2739
- },
2740
- {
2741
- "epoch": 6.59,
2742
- "learning_rate": 0.0002,
2743
- "loss": 0.8976,
2744
- "step": 4560
2745
- },
2746
- {
2747
- "epoch": 6.61,
2748
- "learning_rate": 0.0002,
2749
- "loss": 0.8361,
2750
- "step": 4570
2751
- },
2752
- {
2753
- "epoch": 6.62,
2754
- "learning_rate": 0.0002,
2755
- "loss": 0.7947,
2756
- "step": 4580
2757
- },
2758
- {
2759
- "epoch": 6.64,
2760
- "learning_rate": 0.0002,
2761
- "loss": 0.7715,
2762
- "step": 4590
2763
- },
2764
- {
2765
- "epoch": 6.65,
2766
- "learning_rate": 0.0002,
2767
- "loss": 0.7046,
2768
- "step": 4600
2769
- },
2770
- {
2771
- "epoch": 6.67,
2772
- "learning_rate": 0.0002,
2773
- "loss": 0.8952,
2774
- "step": 4610
2775
- },
2776
- {
2777
- "epoch": 6.68,
2778
- "learning_rate": 0.0002,
2779
- "loss": 0.848,
2780
- "step": 4620
2781
- },
2782
- {
2783
- "epoch": 6.7,
2784
- "learning_rate": 0.0002,
2785
- "loss": 0.7885,
2786
- "step": 4630
2787
- },
2788
- {
2789
- "epoch": 6.71,
2790
- "learning_rate": 0.0002,
2791
- "loss": 0.7753,
2792
- "step": 4640
2793
- },
2794
- {
2795
- "epoch": 6.72,
2796
- "learning_rate": 0.0002,
2797
- "loss": 0.7011,
2798
- "step": 4650
2799
- },
2800
- {
2801
- "epoch": 6.74,
2802
- "learning_rate": 0.0002,
2803
- "loss": 0.8948,
2804
- "step": 4660
2805
- },
2806
- {
2807
- "epoch": 6.75,
2808
- "learning_rate": 0.0002,
2809
- "loss": 0.8186,
2810
- "step": 4670
2811
- },
2812
- {
2813
- "epoch": 6.77,
2814
- "learning_rate": 0.0002,
2815
- "loss": 0.7991,
2816
- "step": 4680
2817
- },
2818
- {
2819
- "epoch": 6.78,
2820
- "learning_rate": 0.0002,
2821
- "loss": 0.756,
2822
- "step": 4690
2823
- },
2824
- {
2825
- "epoch": 6.8,
2826
- "learning_rate": 0.0002,
2827
- "loss": 0.6999,
2828
- "step": 4700
2829
- },
2830
- {
2831
- "epoch": 6.81,
2832
- "learning_rate": 0.0002,
2833
- "loss": 0.8991,
2834
- "step": 4710
2835
- },
2836
- {
2837
- "epoch": 6.83,
2838
- "learning_rate": 0.0002,
2839
- "loss": 0.8334,
2840
- "step": 4720
2841
- },
2842
- {
2843
- "epoch": 6.84,
2844
- "learning_rate": 0.0002,
2845
- "loss": 0.7978,
2846
- "step": 4730
2847
- },
2848
- {
2849
- "epoch": 6.85,
2850
- "learning_rate": 0.0002,
2851
- "loss": 0.7514,
2852
- "step": 4740
2853
- },
2854
- {
2855
- "epoch": 6.87,
2856
- "learning_rate": 0.0002,
2857
- "loss": 0.6968,
2858
- "step": 4750
2859
- },
2860
- {
2861
- "epoch": 6.88,
2862
- "learning_rate": 0.0002,
2863
- "loss": 0.8783,
2864
- "step": 4760
2865
- },
2866
- {
2867
- "epoch": 6.9,
2868
- "learning_rate": 0.0002,
2869
- "loss": 0.8291,
2870
- "step": 4770
2871
- },
2872
- {
2873
- "epoch": 6.91,
2874
- "learning_rate": 0.0002,
2875
- "loss": 0.8059,
2876
- "step": 4780
2877
- },
2878
- {
2879
- "epoch": 6.93,
2880
- "learning_rate": 0.0002,
2881
- "loss": 0.7764,
2882
- "step": 4790
2883
- },
2884
- {
2885
- "epoch": 6.94,
2886
- "learning_rate": 0.0002,
2887
- "loss": 0.6978,
2888
- "step": 4800
2889
- },
2890
- {
2891
- "epoch": 6.96,
2892
- "learning_rate": 0.0002,
2893
- "loss": 0.8875,
2894
- "step": 4810
2895
- },
2896
- {
2897
- "epoch": 6.97,
2898
- "learning_rate": 0.0002,
2899
- "loss": 0.8238,
2900
- "step": 4820
2901
- },
2902
- {
2903
- "epoch": 6.98,
2904
- "learning_rate": 0.0002,
2905
- "loss": 0.7776,
2906
- "step": 4830
2907
- },
2908
- {
2909
- "epoch": 7.0,
2910
- "learning_rate": 0.0002,
2911
- "loss": 0.7001,
2912
- "step": 4840
2913
- },
2914
- {
2915
- "epoch": 7.01,
2916
- "learning_rate": 0.0002,
2917
- "loss": 0.8558,
2918
- "step": 4850
2919
- },
2920
- {
2921
- "epoch": 7.03,
2922
- "learning_rate": 0.0002,
2923
- "loss": 0.8014,
2924
- "step": 4860
2925
- },
2926
- {
2927
- "epoch": 7.04,
2928
- "learning_rate": 0.0002,
2929
- "loss": 0.7671,
2930
- "step": 4870
2931
- },
2932
- {
2933
- "epoch": 7.06,
2934
- "learning_rate": 0.0002,
2935
- "loss": 0.7089,
2936
- "step": 4880
2937
- },
2938
- {
2939
- "epoch": 7.07,
2940
- "learning_rate": 0.0002,
2941
- "loss": 0.6218,
2942
- "step": 4890
2943
- },
2944
- {
2945
- "epoch": 7.09,
2946
- "learning_rate": 0.0002,
2947
- "loss": 0.8637,
2948
- "step": 4900
2949
- },
2950
- {
2951
- "epoch": 7.1,
2952
- "learning_rate": 0.0002,
2953
- "loss": 0.8098,
2954
- "step": 4910
2955
- },
2956
- {
2957
- "epoch": 7.11,
2958
- "learning_rate": 0.0002,
2959
- "loss": 0.7572,
2960
- "step": 4920
2961
- },
2962
- {
2963
- "epoch": 7.13,
2964
- "learning_rate": 0.0002,
2965
- "loss": 0.7368,
2966
- "step": 4930
2967
- },
2968
- {
2969
- "epoch": 7.14,
2970
- "learning_rate": 0.0002,
2971
- "loss": 0.6251,
2972
- "step": 4940
2973
- },
2974
- {
2975
- "epoch": 7.16,
2976
- "learning_rate": 0.0002,
2977
- "loss": 0.8433,
2978
- "step": 4950
2979
- },
2980
- {
2981
- "epoch": 7.17,
2982
- "learning_rate": 0.0002,
2983
- "loss": 0.8094,
2984
- "step": 4960
2985
- },
2986
- {
2987
- "epoch": 7.19,
2988
- "learning_rate": 0.0002,
2989
- "loss": 0.7691,
2990
- "step": 4970
2991
- },
2992
- {
2993
- "epoch": 7.2,
2994
- "learning_rate": 0.0002,
2995
- "loss": 0.7159,
2996
- "step": 4980
2997
- },
2998
- {
2999
- "epoch": 7.22,
3000
- "learning_rate": 0.0002,
3001
- "loss": 0.6349,
3002
- "step": 4990
3003
- },
3004
- {
3005
- "epoch": 7.23,
3006
- "learning_rate": 0.0002,
3007
- "loss": 0.854,
3008
- "step": 5000
3009
  }
3010
  ],
3011
  "max_steps": 5000,
3012
  "num_train_epochs": 8,
3013
- "total_flos": 1.0738578956363366e+18,
3014
  "trial_name": null,
3015
  "trial_params": null
3016
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 4.3383947939262475,
5
+ "global_step": 3000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
1806
  "learning_rate": 0.0002,
1807
  "loss": 0.8549,
1808
  "step": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1809
  }
1810
  ],
1811
  "max_steps": 5000,
1812
  "num_train_epochs": 8,
1813
+ "total_flos": 6.44896481399808e+17,
1814
  "trial_name": null,
1815
  "trial_params": null
1816
  }