Vishal24 commited on
Commit
593f649
·
verified ·
1 Parent(s): db33fee

Upload checkpoint-258060

Browse files
Files changed (5) hide show
  1. optimizer.pt +1 -1
  2. rng_state.pth +1 -1
  3. scheduler.pt +1 -1
  4. trainer_state.json +1891 -5
  5. training_args.bin +1 -1
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b930d5bcbe3df6d5ee9e15433eeacb1cff6f66fe5522d292d659eb4c38da3c0
3
  size 866895354
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf389c91b851e668d5e63f6f7c5b36c73e641d8e4b093e49b672a0d77b0c5abe
3
  size 866895354
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa280e2e6dbd3a37c8dc2dc5fe9c7782cf7832367d7c106285ca3d9da6e271f7
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc23f9a9f7aa172955396035c69940d79883c5359b47c42257084c82f32f20ed
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1fe476619a2e0e15fce441e7fcb5a118b511efee40d7b844292db4a9762411b5
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52acb6d543ff5fdb99c731bea89a5d2499fee1cdf9577497042d302270267fa2
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 10.0,
5
  "eval_steps": 500,
6
- "global_step": 129030,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1893,12 +1893,1898 @@
1893
  "eval_samples_per_second": 797.168,
1894
  "eval_steps_per_second": 12.46,
1895
  "step": 129030
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1896
  }
1897
  ],
1898
  "logging_steps": 500,
1899
- "max_steps": 129030,
1900
  "num_input_tokens_seen": 0,
1901
- "num_train_epochs": 10,
1902
  "save_steps": 500,
1903
  "stateful_callbacks": {
1904
  "TrainerControl": {
@@ -1912,7 +3798,7 @@
1912
  "attributes": {}
1913
  }
1914
  },
1915
- "total_flos": 3.235049087048045e+17,
1916
  "train_batch_size": 64,
1917
  "trial_name": null,
1918
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 20.0,
5
  "eval_steps": 500,
6
+ "global_step": 258060,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1893
  "eval_samples_per_second": 797.168,
1894
  "eval_steps_per_second": 12.46,
1895
  "step": 129030
1896
+ },
1897
+ {
1898
+ "epoch": 10.036425637448655,
1899
+ "grad_norm": 6.090233325958252,
1900
+ "learning_rate": 9.963574362551346e-06,
1901
+ "loss": 2.4784,
1902
+ "step": 129500
1903
+ },
1904
+ {
1905
+ "epoch": 10.075176315585523,
1906
+ "grad_norm": 6.285606384277344,
1907
+ "learning_rate": 9.924823684414478e-06,
1908
+ "loss": 2.4657,
1909
+ "step": 130000
1910
+ },
1911
+ {
1912
+ "epoch": 10.11392699372239,
1913
+ "grad_norm": 5.937399864196777,
1914
+ "learning_rate": 9.886073006277611e-06,
1915
+ "loss": 2.4869,
1916
+ "step": 130500
1917
+ },
1918
+ {
1919
+ "epoch": 10.152677671859257,
1920
+ "grad_norm": 7.235742568969727,
1921
+ "learning_rate": 9.847322328140743e-06,
1922
+ "loss": 2.4726,
1923
+ "step": 131000
1924
+ },
1925
+ {
1926
+ "epoch": 10.191428349996125,
1927
+ "grad_norm": 6.6334028244018555,
1928
+ "learning_rate": 9.808571650003877e-06,
1929
+ "loss": 2.472,
1930
+ "step": 131500
1931
+ },
1932
+ {
1933
+ "epoch": 10.230179028132993,
1934
+ "grad_norm": 7.366402626037598,
1935
+ "learning_rate": 9.769820971867009e-06,
1936
+ "loss": 2.4887,
1937
+ "step": 132000
1938
+ },
1939
+ {
1940
+ "epoch": 10.26892970626986,
1941
+ "grad_norm": 6.17592716217041,
1942
+ "learning_rate": 9.731070293730142e-06,
1943
+ "loss": 2.4854,
1944
+ "step": 132500
1945
+ },
1946
+ {
1947
+ "epoch": 10.307680384406726,
1948
+ "grad_norm": 6.376716613769531,
1949
+ "learning_rate": 9.692319615593274e-06,
1950
+ "loss": 2.486,
1951
+ "step": 133000
1952
+ },
1953
+ {
1954
+ "epoch": 10.346431062543594,
1955
+ "grad_norm": 6.293849945068359,
1956
+ "learning_rate": 9.653568937456407e-06,
1957
+ "loss": 2.4707,
1958
+ "step": 133500
1959
+ },
1960
+ {
1961
+ "epoch": 10.385181740680462,
1962
+ "grad_norm": 6.606166839599609,
1963
+ "learning_rate": 9.614818259319539e-06,
1964
+ "loss": 2.4704,
1965
+ "step": 134000
1966
+ },
1967
+ {
1968
+ "epoch": 10.42393241881733,
1969
+ "grad_norm": 6.805929660797119,
1970
+ "learning_rate": 9.576067581182673e-06,
1971
+ "loss": 2.4727,
1972
+ "step": 134500
1973
+ },
1974
+ {
1975
+ "epoch": 10.462683096954196,
1976
+ "grad_norm": 6.598349571228027,
1977
+ "learning_rate": 9.537316903045804e-06,
1978
+ "loss": 2.4825,
1979
+ "step": 135000
1980
+ },
1981
+ {
1982
+ "epoch": 10.501433775091064,
1983
+ "grad_norm": 5.807904243469238,
1984
+ "learning_rate": 9.498566224908938e-06,
1985
+ "loss": 2.4721,
1986
+ "step": 135500
1987
+ },
1988
+ {
1989
+ "epoch": 10.540184453227932,
1990
+ "grad_norm": 6.681980609893799,
1991
+ "learning_rate": 9.45981554677207e-06,
1992
+ "loss": 2.4764,
1993
+ "step": 136000
1994
+ },
1995
+ {
1996
+ "epoch": 10.5789351313648,
1997
+ "grad_norm": 6.540719032287598,
1998
+ "learning_rate": 9.421064868635203e-06,
1999
+ "loss": 2.4545,
2000
+ "step": 136500
2001
+ },
2002
+ {
2003
+ "epoch": 10.617685809501666,
2004
+ "grad_norm": 6.627035140991211,
2005
+ "learning_rate": 9.382314190498335e-06,
2006
+ "loss": 2.4778,
2007
+ "step": 137000
2008
+ },
2009
+ {
2010
+ "epoch": 10.656436487638533,
2011
+ "grad_norm": 6.348284721374512,
2012
+ "learning_rate": 9.343563512361468e-06,
2013
+ "loss": 2.4597,
2014
+ "step": 137500
2015
+ },
2016
+ {
2017
+ "epoch": 10.695187165775401,
2018
+ "grad_norm": 6.790314197540283,
2019
+ "learning_rate": 9.3048128342246e-06,
2020
+ "loss": 2.471,
2021
+ "step": 138000
2022
+ },
2023
+ {
2024
+ "epoch": 10.733937843912269,
2025
+ "grad_norm": 6.8181233406066895,
2026
+ "learning_rate": 9.266062156087732e-06,
2027
+ "loss": 2.4571,
2028
+ "step": 138500
2029
+ },
2030
+ {
2031
+ "epoch": 10.772688522049135,
2032
+ "grad_norm": 6.593683242797852,
2033
+ "learning_rate": 9.227311477950864e-06,
2034
+ "loss": 2.4843,
2035
+ "step": 139000
2036
+ },
2037
+ {
2038
+ "epoch": 10.811439200186003,
2039
+ "grad_norm": 6.600128650665283,
2040
+ "learning_rate": 9.188560799813997e-06,
2041
+ "loss": 2.464,
2042
+ "step": 139500
2043
+ },
2044
+ {
2045
+ "epoch": 10.85018987832287,
2046
+ "grad_norm": 6.368162631988525,
2047
+ "learning_rate": 9.149810121677129e-06,
2048
+ "loss": 2.4598,
2049
+ "step": 140000
2050
+ },
2051
+ {
2052
+ "epoch": 10.888940556459739,
2053
+ "grad_norm": 6.5435943603515625,
2054
+ "learning_rate": 9.111059443540262e-06,
2055
+ "loss": 2.4704,
2056
+ "step": 140500
2057
+ },
2058
+ {
2059
+ "epoch": 10.927691234596605,
2060
+ "grad_norm": 6.06011962890625,
2061
+ "learning_rate": 9.072308765403394e-06,
2062
+ "loss": 2.4514,
2063
+ "step": 141000
2064
+ },
2065
+ {
2066
+ "epoch": 10.966441912733472,
2067
+ "grad_norm": 7.2288689613342285,
2068
+ "learning_rate": 9.033558087266528e-06,
2069
+ "loss": 2.4521,
2070
+ "step": 141500
2071
+ },
2072
+ {
2073
+ "epoch": 11.0,
2074
+ "eval_loss": 2.3912322521209717,
2075
+ "eval_runtime": 258.9953,
2076
+ "eval_samples_per_second": 797.176,
2077
+ "eval_steps_per_second": 12.46,
2078
+ "step": 141933
2079
+ },
2080
+ {
2081
+ "epoch": 11.00519259087034,
2082
+ "grad_norm": 6.698403358459473,
2083
+ "learning_rate": 8.99480740912966e-06,
2084
+ "loss": 2.4457,
2085
+ "step": 142000
2086
+ },
2087
+ {
2088
+ "epoch": 11.043943269007208,
2089
+ "grad_norm": 6.455236911773682,
2090
+ "learning_rate": 8.956056730992793e-06,
2091
+ "loss": 2.4507,
2092
+ "step": 142500
2093
+ },
2094
+ {
2095
+ "epoch": 11.082693947144074,
2096
+ "grad_norm": 6.590576648712158,
2097
+ "learning_rate": 8.917306052855925e-06,
2098
+ "loss": 2.4256,
2099
+ "step": 143000
2100
+ },
2101
+ {
2102
+ "epoch": 11.121444625280942,
2103
+ "grad_norm": 6.957404136657715,
2104
+ "learning_rate": 8.878555374719058e-06,
2105
+ "loss": 2.4549,
2106
+ "step": 143500
2107
+ },
2108
+ {
2109
+ "epoch": 11.16019530341781,
2110
+ "grad_norm": 6.926699161529541,
2111
+ "learning_rate": 8.83980469658219e-06,
2112
+ "loss": 2.4499,
2113
+ "step": 144000
2114
+ },
2115
+ {
2116
+ "epoch": 11.198945981554678,
2117
+ "grad_norm": 6.484086036682129,
2118
+ "learning_rate": 8.801054018445324e-06,
2119
+ "loss": 2.4443,
2120
+ "step": 144500
2121
+ },
2122
+ {
2123
+ "epoch": 11.237696659691544,
2124
+ "grad_norm": 6.107706069946289,
2125
+ "learning_rate": 8.762303340308455e-06,
2126
+ "loss": 2.4459,
2127
+ "step": 145000
2128
+ },
2129
+ {
2130
+ "epoch": 11.276447337828412,
2131
+ "grad_norm": 7.301278591156006,
2132
+ "learning_rate": 8.723552662171589e-06,
2133
+ "loss": 2.4463,
2134
+ "step": 145500
2135
+ },
2136
+ {
2137
+ "epoch": 11.31519801596528,
2138
+ "grad_norm": 6.378045082092285,
2139
+ "learning_rate": 8.68480198403472e-06,
2140
+ "loss": 2.4494,
2141
+ "step": 146000
2142
+ },
2143
+ {
2144
+ "epoch": 11.353948694102147,
2145
+ "grad_norm": 6.803300857543945,
2146
+ "learning_rate": 8.646051305897854e-06,
2147
+ "loss": 2.4235,
2148
+ "step": 146500
2149
+ },
2150
+ {
2151
+ "epoch": 11.392699372239015,
2152
+ "grad_norm": 6.401794910430908,
2153
+ "learning_rate": 8.607300627760986e-06,
2154
+ "loss": 2.4353,
2155
+ "step": 147000
2156
+ },
2157
+ {
2158
+ "epoch": 11.431450050375881,
2159
+ "grad_norm": 6.455550193786621,
2160
+ "learning_rate": 8.56854994962412e-06,
2161
+ "loss": 2.4306,
2162
+ "step": 147500
2163
+ },
2164
+ {
2165
+ "epoch": 11.470200728512749,
2166
+ "grad_norm": 6.416442394256592,
2167
+ "learning_rate": 8.529799271487251e-06,
2168
+ "loss": 2.4143,
2169
+ "step": 148000
2170
+ },
2171
+ {
2172
+ "epoch": 11.508951406649617,
2173
+ "grad_norm": 6.768812656402588,
2174
+ "learning_rate": 8.491048593350385e-06,
2175
+ "loss": 2.4184,
2176
+ "step": 148500
2177
+ },
2178
+ {
2179
+ "epoch": 11.547702084786483,
2180
+ "grad_norm": 6.085323810577393,
2181
+ "learning_rate": 8.452297915213516e-06,
2182
+ "loss": 2.4318,
2183
+ "step": 149000
2184
+ },
2185
+ {
2186
+ "epoch": 11.58645276292335,
2187
+ "grad_norm": 6.181857585906982,
2188
+ "learning_rate": 8.41354723707665e-06,
2189
+ "loss": 2.4348,
2190
+ "step": 149500
2191
+ },
2192
+ {
2193
+ "epoch": 11.625203441060219,
2194
+ "grad_norm": 6.558756351470947,
2195
+ "learning_rate": 8.374796558939782e-06,
2196
+ "loss": 2.413,
2197
+ "step": 150000
2198
+ },
2199
+ {
2200
+ "epoch": 11.663954119197086,
2201
+ "grad_norm": 6.249685287475586,
2202
+ "learning_rate": 8.336045880802915e-06,
2203
+ "loss": 2.4271,
2204
+ "step": 150500
2205
+ },
2206
+ {
2207
+ "epoch": 11.702704797333954,
2208
+ "grad_norm": 6.789103984832764,
2209
+ "learning_rate": 8.297295202666047e-06,
2210
+ "loss": 2.4226,
2211
+ "step": 151000
2212
+ },
2213
+ {
2214
+ "epoch": 11.74145547547082,
2215
+ "grad_norm": 6.4289140701293945,
2216
+ "learning_rate": 8.25854452452918e-06,
2217
+ "loss": 2.4184,
2218
+ "step": 151500
2219
+ },
2220
+ {
2221
+ "epoch": 11.780206153607688,
2222
+ "grad_norm": 6.098612308502197,
2223
+ "learning_rate": 8.219793846392312e-06,
2224
+ "loss": 2.4132,
2225
+ "step": 152000
2226
+ },
2227
+ {
2228
+ "epoch": 11.818956831744556,
2229
+ "grad_norm": 6.500378608703613,
2230
+ "learning_rate": 8.181043168255444e-06,
2231
+ "loss": 2.4184,
2232
+ "step": 152500
2233
+ },
2234
+ {
2235
+ "epoch": 11.857707509881424,
2236
+ "grad_norm": 6.583259105682373,
2237
+ "learning_rate": 8.142292490118577e-06,
2238
+ "loss": 2.4259,
2239
+ "step": 153000
2240
+ },
2241
+ {
2242
+ "epoch": 11.89645818801829,
2243
+ "grad_norm": 6.7018303871154785,
2244
+ "learning_rate": 8.10354181198171e-06,
2245
+ "loss": 2.4185,
2246
+ "step": 153500
2247
+ },
2248
+ {
2249
+ "epoch": 11.935208866155158,
2250
+ "grad_norm": 6.679374694824219,
2251
+ "learning_rate": 8.064791133844843e-06,
2252
+ "loss": 2.4078,
2253
+ "step": 154000
2254
+ },
2255
+ {
2256
+ "epoch": 11.973959544292025,
2257
+ "grad_norm": 6.576003551483154,
2258
+ "learning_rate": 8.026040455707974e-06,
2259
+ "loss": 2.4212,
2260
+ "step": 154500
2261
+ },
2262
+ {
2263
+ "epoch": 12.0,
2264
+ "eval_loss": 2.3491039276123047,
2265
+ "eval_runtime": 260.4232,
2266
+ "eval_samples_per_second": 792.806,
2267
+ "eval_steps_per_second": 12.391,
2268
+ "step": 154836
2269
+ },
2270
+ {
2271
+ "epoch": 12.012710222428893,
2272
+ "grad_norm": 6.768045902252197,
2273
+ "learning_rate": 7.987289777571108e-06,
2274
+ "loss": 2.399,
2275
+ "step": 155000
2276
+ },
2277
+ {
2278
+ "epoch": 12.05146090056576,
2279
+ "grad_norm": 6.445169925689697,
2280
+ "learning_rate": 7.94853909943424e-06,
2281
+ "loss": 2.4055,
2282
+ "step": 155500
2283
+ },
2284
+ {
2285
+ "epoch": 12.090211578702627,
2286
+ "grad_norm": 6.684764385223389,
2287
+ "learning_rate": 7.909788421297373e-06,
2288
+ "loss": 2.3979,
2289
+ "step": 156000
2290
+ },
2291
+ {
2292
+ "epoch": 12.128962256839495,
2293
+ "grad_norm": 7.150822162628174,
2294
+ "learning_rate": 7.871037743160505e-06,
2295
+ "loss": 2.4091,
2296
+ "step": 156500
2297
+ },
2298
+ {
2299
+ "epoch": 12.167712934976363,
2300
+ "grad_norm": 6.7067131996154785,
2301
+ "learning_rate": 7.832287065023639e-06,
2302
+ "loss": 2.4057,
2303
+ "step": 157000
2304
+ },
2305
+ {
2306
+ "epoch": 12.206463613113229,
2307
+ "grad_norm": 6.288236141204834,
2308
+ "learning_rate": 7.79353638688677e-06,
2309
+ "loss": 2.4024,
2310
+ "step": 157500
2311
+ },
2312
+ {
2313
+ "epoch": 12.245214291250097,
2314
+ "grad_norm": 6.532754898071289,
2315
+ "learning_rate": 7.754785708749904e-06,
2316
+ "loss": 2.4119,
2317
+ "step": 158000
2318
+ },
2319
+ {
2320
+ "epoch": 12.283964969386965,
2321
+ "grad_norm": 6.437507629394531,
2322
+ "learning_rate": 7.716035030613036e-06,
2323
+ "loss": 2.4048,
2324
+ "step": 158500
2325
+ },
2326
+ {
2327
+ "epoch": 12.322715647523832,
2328
+ "grad_norm": 6.648064136505127,
2329
+ "learning_rate": 7.677284352476169e-06,
2330
+ "loss": 2.3954,
2331
+ "step": 159000
2332
+ },
2333
+ {
2334
+ "epoch": 12.361466325660698,
2335
+ "grad_norm": 6.406070232391357,
2336
+ "learning_rate": 7.6385336743393e-06,
2337
+ "loss": 2.4069,
2338
+ "step": 159500
2339
+ },
2340
+ {
2341
+ "epoch": 12.400217003797566,
2342
+ "grad_norm": 6.75925350189209,
2343
+ "learning_rate": 7.5997829962024335e-06,
2344
+ "loss": 2.3803,
2345
+ "step": 160000
2346
+ },
2347
+ {
2348
+ "epoch": 12.438967681934434,
2349
+ "grad_norm": 7.390876770019531,
2350
+ "learning_rate": 7.561032318065566e-06,
2351
+ "loss": 2.3952,
2352
+ "step": 160500
2353
+ },
2354
+ {
2355
+ "epoch": 12.477718360071302,
2356
+ "grad_norm": 6.584438800811768,
2357
+ "learning_rate": 7.522281639928699e-06,
2358
+ "loss": 2.3921,
2359
+ "step": 161000
2360
+ },
2361
+ {
2362
+ "epoch": 12.516469038208168,
2363
+ "grad_norm": 6.7814040184021,
2364
+ "learning_rate": 7.483530961791831e-06,
2365
+ "loss": 2.4035,
2366
+ "step": 161500
2367
+ },
2368
+ {
2369
+ "epoch": 12.555219716345036,
2370
+ "grad_norm": 6.544926166534424,
2371
+ "learning_rate": 7.444780283654964e-06,
2372
+ "loss": 2.3855,
2373
+ "step": 162000
2374
+ },
2375
+ {
2376
+ "epoch": 12.593970394481904,
2377
+ "grad_norm": 6.649155139923096,
2378
+ "learning_rate": 7.406029605518097e-06,
2379
+ "loss": 2.3884,
2380
+ "step": 162500
2381
+ },
2382
+ {
2383
+ "epoch": 12.632721072618772,
2384
+ "grad_norm": 6.128752708435059,
2385
+ "learning_rate": 7.367278927381229e-06,
2386
+ "loss": 2.3915,
2387
+ "step": 163000
2388
+ },
2389
+ {
2390
+ "epoch": 12.671471750755638,
2391
+ "grad_norm": 6.694360733032227,
2392
+ "learning_rate": 7.328528249244362e-06,
2393
+ "loss": 2.4065,
2394
+ "step": 163500
2395
+ },
2396
+ {
2397
+ "epoch": 12.710222428892505,
2398
+ "grad_norm": 6.9979963302612305,
2399
+ "learning_rate": 7.2897775711074945e-06,
2400
+ "loss": 2.3816,
2401
+ "step": 164000
2402
+ },
2403
+ {
2404
+ "epoch": 12.748973107029373,
2405
+ "grad_norm": 6.7657294273376465,
2406
+ "learning_rate": 7.251026892970627e-06,
2407
+ "loss": 2.385,
2408
+ "step": 164500
2409
+ },
2410
+ {
2411
+ "epoch": 12.787723785166241,
2412
+ "grad_norm": 7.142265796661377,
2413
+ "learning_rate": 7.21227621483376e-06,
2414
+ "loss": 2.3809,
2415
+ "step": 165000
2416
+ },
2417
+ {
2418
+ "epoch": 12.826474463303107,
2419
+ "grad_norm": 6.2213134765625,
2420
+ "learning_rate": 7.1735255366968924e-06,
2421
+ "loss": 2.3883,
2422
+ "step": 165500
2423
+ },
2424
+ {
2425
+ "epoch": 12.865225141439975,
2426
+ "grad_norm": 6.274342060089111,
2427
+ "learning_rate": 7.134774858560025e-06,
2428
+ "loss": 2.3838,
2429
+ "step": 166000
2430
+ },
2431
+ {
2432
+ "epoch": 12.903975819576843,
2433
+ "grad_norm": 6.5893049240112305,
2434
+ "learning_rate": 7.096024180423158e-06,
2435
+ "loss": 2.3832,
2436
+ "step": 166500
2437
+ },
2438
+ {
2439
+ "epoch": 12.94272649771371,
2440
+ "grad_norm": 6.229060173034668,
2441
+ "learning_rate": 7.05727350228629e-06,
2442
+ "loss": 2.3839,
2443
+ "step": 167000
2444
+ },
2445
+ {
2446
+ "epoch": 12.981477175850577,
2447
+ "grad_norm": 7.251420497894287,
2448
+ "learning_rate": 7.018522824149423e-06,
2449
+ "loss": 2.3838,
2450
+ "step": 167500
2451
+ },
2452
+ {
2453
+ "epoch": 13.0,
2454
+ "eval_loss": 2.3215689659118652,
2455
+ "eval_runtime": 259.7568,
2456
+ "eval_samples_per_second": 794.84,
2457
+ "eval_steps_per_second": 12.423,
2458
+ "step": 167739
2459
+ },
2460
+ {
2461
+ "epoch": 13.020227853987445,
2462
+ "grad_norm": 5.944735050201416,
2463
+ "learning_rate": 6.979772146012556e-06,
2464
+ "loss": 2.3687,
2465
+ "step": 168000
2466
+ },
2467
+ {
2468
+ "epoch": 13.058978532124312,
2469
+ "grad_norm": 6.25685977935791,
2470
+ "learning_rate": 6.941021467875688e-06,
2471
+ "loss": 2.3761,
2472
+ "step": 168500
2473
+ },
2474
+ {
2475
+ "epoch": 13.09772921026118,
2476
+ "grad_norm": 6.244680881500244,
2477
+ "learning_rate": 6.902270789738821e-06,
2478
+ "loss": 2.3463,
2479
+ "step": 169000
2480
+ },
2481
+ {
2482
+ "epoch": 13.136479888398046,
2483
+ "grad_norm": 6.370804309844971,
2484
+ "learning_rate": 6.8635201116019535e-06,
2485
+ "loss": 2.3597,
2486
+ "step": 169500
2487
+ },
2488
+ {
2489
+ "epoch": 13.175230566534914,
2490
+ "grad_norm": 6.249234676361084,
2491
+ "learning_rate": 6.824769433465086e-06,
2492
+ "loss": 2.3679,
2493
+ "step": 170000
2494
+ },
2495
+ {
2496
+ "epoch": 13.213981244671782,
2497
+ "grad_norm": 6.973300933837891,
2498
+ "learning_rate": 6.786018755328219e-06,
2499
+ "loss": 2.3669,
2500
+ "step": 170500
2501
+ },
2502
+ {
2503
+ "epoch": 13.25273192280865,
2504
+ "grad_norm": 7.319492816925049,
2505
+ "learning_rate": 6.747268077191351e-06,
2506
+ "loss": 2.3528,
2507
+ "step": 171000
2508
+ },
2509
+ {
2510
+ "epoch": 13.291482600945516,
2511
+ "grad_norm": 6.924526214599609,
2512
+ "learning_rate": 6.708517399054484e-06,
2513
+ "loss": 2.3662,
2514
+ "step": 171500
2515
+ },
2516
+ {
2517
+ "epoch": 13.330233279082384,
2518
+ "grad_norm": 6.761091709136963,
2519
+ "learning_rate": 6.669766720917617e-06,
2520
+ "loss": 2.3608,
2521
+ "step": 172000
2522
+ },
2523
+ {
2524
+ "epoch": 13.368983957219251,
2525
+ "grad_norm": 6.105197429656982,
2526
+ "learning_rate": 6.631016042780749e-06,
2527
+ "loss": 2.3536,
2528
+ "step": 172500
2529
+ },
2530
+ {
2531
+ "epoch": 13.40773463535612,
2532
+ "grad_norm": 6.724457740783691,
2533
+ "learning_rate": 6.592265364643882e-06,
2534
+ "loss": 2.3682,
2535
+ "step": 173000
2536
+ },
2537
+ {
2538
+ "epoch": 13.446485313492985,
2539
+ "grad_norm": 6.62090539932251,
2540
+ "learning_rate": 6.553514686507015e-06,
2541
+ "loss": 2.3549,
2542
+ "step": 173500
2543
+ },
2544
+ {
2545
+ "epoch": 13.485235991629853,
2546
+ "grad_norm": 6.862425327301025,
2547
+ "learning_rate": 6.514764008370147e-06,
2548
+ "loss": 2.3475,
2549
+ "step": 174000
2550
+ },
2551
+ {
2552
+ "epoch": 13.523986669766721,
2553
+ "grad_norm": 6.164032936096191,
2554
+ "learning_rate": 6.47601333023328e-06,
2555
+ "loss": 2.3625,
2556
+ "step": 174500
2557
+ },
2558
+ {
2559
+ "epoch": 13.562737347903589,
2560
+ "grad_norm": 7.522220134735107,
2561
+ "learning_rate": 6.4372626520964125e-06,
2562
+ "loss": 2.3676,
2563
+ "step": 175000
2564
+ },
2565
+ {
2566
+ "epoch": 13.601488026040455,
2567
+ "grad_norm": 6.564206600189209,
2568
+ "learning_rate": 6.398511973959545e-06,
2569
+ "loss": 2.3606,
2570
+ "step": 175500
2571
+ },
2572
+ {
2573
+ "epoch": 13.640238704177323,
2574
+ "grad_norm": 6.069074630737305,
2575
+ "learning_rate": 6.359761295822677e-06,
2576
+ "loss": 2.3644,
2577
+ "step": 176000
2578
+ },
2579
+ {
2580
+ "epoch": 13.67898938231419,
2581
+ "grad_norm": 6.570771217346191,
2582
+ "learning_rate": 6.3210106176858095e-06,
2583
+ "loss": 2.3711,
2584
+ "step": 176500
2585
+ },
2586
+ {
2587
+ "epoch": 13.717740060451058,
2588
+ "grad_norm": 6.1281609535217285,
2589
+ "learning_rate": 6.282259939548942e-06,
2590
+ "loss": 2.348,
2591
+ "step": 177000
2592
+ },
2593
+ {
2594
+ "epoch": 13.756490738587924,
2595
+ "grad_norm": 6.176905632019043,
2596
+ "learning_rate": 6.243509261412075e-06,
2597
+ "loss": 2.379,
2598
+ "step": 177500
2599
+ },
2600
+ {
2601
+ "epoch": 13.795241416724792,
2602
+ "grad_norm": 7.890781402587891,
2603
+ "learning_rate": 6.2047585832752074e-06,
2604
+ "loss": 2.365,
2605
+ "step": 178000
2606
+ },
2607
+ {
2608
+ "epoch": 13.83399209486166,
2609
+ "grad_norm": 6.160940647125244,
2610
+ "learning_rate": 6.16600790513834e-06,
2611
+ "loss": 2.3391,
2612
+ "step": 178500
2613
+ },
2614
+ {
2615
+ "epoch": 13.872742772998528,
2616
+ "grad_norm": 6.732828617095947,
2617
+ "learning_rate": 6.127257227001473e-06,
2618
+ "loss": 2.355,
2619
+ "step": 179000
2620
+ },
2621
+ {
2622
+ "epoch": 13.911493451135394,
2623
+ "grad_norm": 6.500529766082764,
2624
+ "learning_rate": 6.088506548864605e-06,
2625
+ "loss": 2.3512,
2626
+ "step": 179500
2627
+ },
2628
+ {
2629
+ "epoch": 13.950244129272262,
2630
+ "grad_norm": 7.362790584564209,
2631
+ "learning_rate": 6.049755870727738e-06,
2632
+ "loss": 2.3654,
2633
+ "step": 180000
2634
+ },
2635
+ {
2636
+ "epoch": 13.98899480740913,
2637
+ "grad_norm": 7.070291519165039,
2638
+ "learning_rate": 6.011005192590871e-06,
2639
+ "loss": 2.3444,
2640
+ "step": 180500
2641
+ },
2642
+ {
2643
+ "epoch": 14.0,
2644
+ "eval_loss": 2.2924630641937256,
2645
+ "eval_runtime": 259.3076,
2646
+ "eval_samples_per_second": 796.217,
2647
+ "eval_steps_per_second": 12.445,
2648
+ "step": 180642
2649
+ },
2650
+ {
2651
+ "epoch": 14.027745485545998,
2652
+ "grad_norm": 7.284486293792725,
2653
+ "learning_rate": 5.972254514454003e-06,
2654
+ "loss": 2.3296,
2655
+ "step": 181000
2656
+ },
2657
+ {
2658
+ "epoch": 14.066496163682864,
2659
+ "grad_norm": 7.636621952056885,
2660
+ "learning_rate": 5.933503836317136e-06,
2661
+ "loss": 2.3314,
2662
+ "step": 181500
2663
+ },
2664
+ {
2665
+ "epoch": 14.105246841819731,
2666
+ "grad_norm": 6.692602634429932,
2667
+ "learning_rate": 5.8947531581802685e-06,
2668
+ "loss": 2.3363,
2669
+ "step": 182000
2670
+ },
2671
+ {
2672
+ "epoch": 14.1439975199566,
2673
+ "grad_norm": 6.751750469207764,
2674
+ "learning_rate": 5.856002480043401e-06,
2675
+ "loss": 2.3174,
2676
+ "step": 182500
2677
+ },
2678
+ {
2679
+ "epoch": 14.182748198093467,
2680
+ "grad_norm": 7.041817665100098,
2681
+ "learning_rate": 5.817251801906534e-06,
2682
+ "loss": 2.3295,
2683
+ "step": 183000
2684
+ },
2685
+ {
2686
+ "epoch": 14.221498876230335,
2687
+ "grad_norm": 7.414912700653076,
2688
+ "learning_rate": 5.778501123769666e-06,
2689
+ "loss": 2.3386,
2690
+ "step": 183500
2691
+ },
2692
+ {
2693
+ "epoch": 14.260249554367201,
2694
+ "grad_norm": 7.009491920471191,
2695
+ "learning_rate": 5.739750445632799e-06,
2696
+ "loss": 2.3282,
2697
+ "step": 184000
2698
+ },
2699
+ {
2700
+ "epoch": 14.299000232504069,
2701
+ "grad_norm": 6.77699089050293,
2702
+ "learning_rate": 5.700999767495932e-06,
2703
+ "loss": 2.3323,
2704
+ "step": 184500
2705
+ },
2706
+ {
2707
+ "epoch": 14.337750910640937,
2708
+ "grad_norm": 6.922458171844482,
2709
+ "learning_rate": 5.662249089359064e-06,
2710
+ "loss": 2.3545,
2711
+ "step": 185000
2712
+ },
2713
+ {
2714
+ "epoch": 14.376501588777803,
2715
+ "grad_norm": 7.635495185852051,
2716
+ "learning_rate": 5.623498411222197e-06,
2717
+ "loss": 2.3429,
2718
+ "step": 185500
2719
+ },
2720
+ {
2721
+ "epoch": 14.41525226691467,
2722
+ "grad_norm": 6.657200813293457,
2723
+ "learning_rate": 5.58474773308533e-06,
2724
+ "loss": 2.3371,
2725
+ "step": 186000
2726
+ },
2727
+ {
2728
+ "epoch": 14.454002945051538,
2729
+ "grad_norm": 6.328368663787842,
2730
+ "learning_rate": 5.545997054948462e-06,
2731
+ "loss": 2.3225,
2732
+ "step": 186500
2733
+ },
2734
+ {
2735
+ "epoch": 14.492753623188406,
2736
+ "grad_norm": 6.7084503173828125,
2737
+ "learning_rate": 5.507246376811595e-06,
2738
+ "loss": 2.3141,
2739
+ "step": 187000
2740
+ },
2741
+ {
2742
+ "epoch": 14.531504301325274,
2743
+ "grad_norm": 6.23046875,
2744
+ "learning_rate": 5.4684956986747275e-06,
2745
+ "loss": 2.3387,
2746
+ "step": 187500
2747
+ },
2748
+ {
2749
+ "epoch": 14.57025497946214,
2750
+ "grad_norm": 6.53918981552124,
2751
+ "learning_rate": 5.42974502053786e-06,
2752
+ "loss": 2.3355,
2753
+ "step": 188000
2754
+ },
2755
+ {
2756
+ "epoch": 14.609005657599008,
2757
+ "grad_norm": 6.816432952880859,
2758
+ "learning_rate": 5.390994342400993e-06,
2759
+ "loss": 2.3409,
2760
+ "step": 188500
2761
+ },
2762
+ {
2763
+ "epoch": 14.647756335735876,
2764
+ "grad_norm": 6.9504475593566895,
2765
+ "learning_rate": 5.352243664264125e-06,
2766
+ "loss": 2.3274,
2767
+ "step": 189000
2768
+ },
2769
+ {
2770
+ "epoch": 14.686507013872744,
2771
+ "grad_norm": 7.058226585388184,
2772
+ "learning_rate": 5.313492986127258e-06,
2773
+ "loss": 2.3295,
2774
+ "step": 189500
2775
+ },
2776
+ {
2777
+ "epoch": 14.72525769200961,
2778
+ "grad_norm": 6.337547302246094,
2779
+ "learning_rate": 5.274742307990391e-06,
2780
+ "loss": 2.316,
2781
+ "step": 190000
2782
+ },
2783
+ {
2784
+ "epoch": 14.764008370146477,
2785
+ "grad_norm": 7.420670032501221,
2786
+ "learning_rate": 5.235991629853523e-06,
2787
+ "loss": 2.3313,
2788
+ "step": 190500
2789
+ },
2790
+ {
2791
+ "epoch": 14.802759048283345,
2792
+ "grad_norm": 6.559388637542725,
2793
+ "learning_rate": 5.197240951716656e-06,
2794
+ "loss": 2.3368,
2795
+ "step": 191000
2796
+ },
2797
+ {
2798
+ "epoch": 14.841509726420213,
2799
+ "grad_norm": 6.416265487670898,
2800
+ "learning_rate": 5.1584902735797886e-06,
2801
+ "loss": 2.3139,
2802
+ "step": 191500
2803
+ },
2804
+ {
2805
+ "epoch": 14.88026040455708,
2806
+ "grad_norm": 6.204991817474365,
2807
+ "learning_rate": 5.11973959544292e-06,
2808
+ "loss": 2.3209,
2809
+ "step": 192000
2810
+ },
2811
+ {
2812
+ "epoch": 14.919011082693947,
2813
+ "grad_norm": 7.657558441162109,
2814
+ "learning_rate": 5.080988917306053e-06,
2815
+ "loss": 2.3346,
2816
+ "step": 192500
2817
+ },
2818
+ {
2819
+ "epoch": 14.957761760830815,
2820
+ "grad_norm": 6.812448024749756,
2821
+ "learning_rate": 5.042238239169186e-06,
2822
+ "loss": 2.3226,
2823
+ "step": 193000
2824
+ },
2825
+ {
2826
+ "epoch": 14.996512438967683,
2827
+ "grad_norm": 5.866453170776367,
2828
+ "learning_rate": 5.003487561032318e-06,
2829
+ "loss": 2.3034,
2830
+ "step": 193500
2831
+ },
2832
+ {
2833
+ "epoch": 15.0,
2834
+ "eval_loss": 2.2758021354675293,
2835
+ "eval_runtime": 268.9287,
2836
+ "eval_samples_per_second": 767.731,
2837
+ "eval_steps_per_second": 11.999,
2838
+ "step": 193545
2839
+ },
2840
+ {
2841
+ "epoch": 15.035263117104549,
2842
+ "grad_norm": 6.998913288116455,
2843
+ "learning_rate": 4.964736882895451e-06,
2844
+ "loss": 2.3103,
2845
+ "step": 194000
2846
+ },
2847
+ {
2848
+ "epoch": 15.074013795241417,
2849
+ "grad_norm": 7.022980213165283,
2850
+ "learning_rate": 4.9259862047585835e-06,
2851
+ "loss": 2.3121,
2852
+ "step": 194500
2853
+ },
2854
+ {
2855
+ "epoch": 15.112764473378284,
2856
+ "grad_norm": 6.3553056716918945,
2857
+ "learning_rate": 4.887235526621716e-06,
2858
+ "loss": 2.325,
2859
+ "step": 195000
2860
+ },
2861
+ {
2862
+ "epoch": 15.151515151515152,
2863
+ "grad_norm": 7.574887752532959,
2864
+ "learning_rate": 4.848484848484849e-06,
2865
+ "loss": 2.3128,
2866
+ "step": 195500
2867
+ },
2868
+ {
2869
+ "epoch": 15.190265829652018,
2870
+ "grad_norm": 6.3977556228637695,
2871
+ "learning_rate": 4.809734170347981e-06,
2872
+ "loss": 2.3058,
2873
+ "step": 196000
2874
+ },
2875
+ {
2876
+ "epoch": 15.229016507788886,
2877
+ "grad_norm": 6.198862552642822,
2878
+ "learning_rate": 4.770983492211114e-06,
2879
+ "loss": 2.3111,
2880
+ "step": 196500
2881
+ },
2882
+ {
2883
+ "epoch": 15.267767185925754,
2884
+ "grad_norm": 7.1892499923706055,
2885
+ "learning_rate": 4.732232814074247e-06,
2886
+ "loss": 2.3181,
2887
+ "step": 197000
2888
+ },
2889
+ {
2890
+ "epoch": 15.306517864062622,
2891
+ "grad_norm": 6.773824214935303,
2892
+ "learning_rate": 4.693482135937379e-06,
2893
+ "loss": 2.3158,
2894
+ "step": 197500
2895
+ },
2896
+ {
2897
+ "epoch": 15.345268542199488,
2898
+ "grad_norm": 6.595972537994385,
2899
+ "learning_rate": 4.654731457800512e-06,
2900
+ "loss": 2.2989,
2901
+ "step": 198000
2902
+ },
2903
+ {
2904
+ "epoch": 15.384019220336356,
2905
+ "grad_norm": 7.397641658782959,
2906
+ "learning_rate": 4.615980779663645e-06,
2907
+ "loss": 2.3143,
2908
+ "step": 198500
2909
+ },
2910
+ {
2911
+ "epoch": 15.422769898473224,
2912
+ "grad_norm": 7.2511820793151855,
2913
+ "learning_rate": 4.577230101526777e-06,
2914
+ "loss": 2.3077,
2915
+ "step": 199000
2916
+ },
2917
+ {
2918
+ "epoch": 15.461520576610091,
2919
+ "grad_norm": 6.52310848236084,
2920
+ "learning_rate": 4.53847942338991e-06,
2921
+ "loss": 2.3062,
2922
+ "step": 199500
2923
+ },
2924
+ {
2925
+ "epoch": 15.500271254746957,
2926
+ "grad_norm": 6.681788921356201,
2927
+ "learning_rate": 4.4997287452530425e-06,
2928
+ "loss": 2.3078,
2929
+ "step": 200000
2930
+ },
2931
+ {
2932
+ "epoch": 15.539021932883825,
2933
+ "grad_norm": 7.010565280914307,
2934
+ "learning_rate": 4.460978067116175e-06,
2935
+ "loss": 2.3031,
2936
+ "step": 200500
2937
+ },
2938
+ {
2939
+ "epoch": 15.577772611020693,
2940
+ "grad_norm": 7.412187576293945,
2941
+ "learning_rate": 4.422227388979308e-06,
2942
+ "loss": 2.3029,
2943
+ "step": 201000
2944
+ },
2945
+ {
2946
+ "epoch": 15.616523289157561,
2947
+ "grad_norm": 6.586581707000732,
2948
+ "learning_rate": 4.38347671084244e-06,
2949
+ "loss": 2.3092,
2950
+ "step": 201500
2951
+ },
2952
+ {
2953
+ "epoch": 15.655273967294427,
2954
+ "grad_norm": 6.430338382720947,
2955
+ "learning_rate": 4.344726032705573e-06,
2956
+ "loss": 2.2972,
2957
+ "step": 202000
2958
+ },
2959
+ {
2960
+ "epoch": 15.694024645431295,
2961
+ "grad_norm": 6.151809215545654,
2962
+ "learning_rate": 4.305975354568706e-06,
2963
+ "loss": 2.2972,
2964
+ "step": 202500
2965
+ },
2966
+ {
2967
+ "epoch": 15.732775323568163,
2968
+ "grad_norm": 7.195096492767334,
2969
+ "learning_rate": 4.267224676431838e-06,
2970
+ "loss": 2.3045,
2971
+ "step": 203000
2972
+ },
2973
+ {
2974
+ "epoch": 15.77152600170503,
2975
+ "grad_norm": 6.76158332824707,
2976
+ "learning_rate": 4.228473998294971e-06,
2977
+ "loss": 2.2995,
2978
+ "step": 203500
2979
+ },
2980
+ {
2981
+ "epoch": 15.810276679841897,
2982
+ "grad_norm": 6.710601329803467,
2983
+ "learning_rate": 4.1897233201581036e-06,
2984
+ "loss": 2.3045,
2985
+ "step": 204000
2986
+ },
2987
+ {
2988
+ "epoch": 15.849027357978764,
2989
+ "grad_norm": 6.813743591308594,
2990
+ "learning_rate": 4.150972642021236e-06,
2991
+ "loss": 2.3114,
2992
+ "step": 204500
2993
+ },
2994
+ {
2995
+ "epoch": 15.887778036115632,
2996
+ "grad_norm": 7.168315410614014,
2997
+ "learning_rate": 4.112221963884369e-06,
2998
+ "loss": 2.2995,
2999
+ "step": 205000
3000
+ },
3001
+ {
3002
+ "epoch": 15.9265287142525,
3003
+ "grad_norm": 6.606774806976318,
3004
+ "learning_rate": 4.0734712857475015e-06,
3005
+ "loss": 2.3023,
3006
+ "step": 205500
3007
+ },
3008
+ {
3009
+ "epoch": 15.965279392389366,
3010
+ "grad_norm": 6.852230548858643,
3011
+ "learning_rate": 4.034720607610634e-06,
3012
+ "loss": 2.311,
3013
+ "step": 206000
3014
+ },
3015
+ {
3016
+ "epoch": 16.0,
3017
+ "eval_loss": 2.252058982849121,
3018
+ "eval_runtime": 272.2097,
3019
+ "eval_samples_per_second": 758.478,
3020
+ "eval_steps_per_second": 11.855,
3021
+ "step": 206448
3022
+ },
3023
+ {
3024
+ "epoch": 16.004030070526234,
3025
+ "grad_norm": 7.245954990386963,
3026
+ "learning_rate": 3.995969929473767e-06,
3027
+ "loss": 2.2896,
3028
+ "step": 206500
3029
+ },
3030
+ {
3031
+ "epoch": 16.0427807486631,
3032
+ "grad_norm": 6.094116687774658,
3033
+ "learning_rate": 3.957219251336899e-06,
3034
+ "loss": 2.2999,
3035
+ "step": 207000
3036
+ },
3037
+ {
3038
+ "epoch": 16.08153142679997,
3039
+ "grad_norm": 6.302695274353027,
3040
+ "learning_rate": 3.918468573200031e-06,
3041
+ "loss": 2.3017,
3042
+ "step": 207500
3043
+ },
3044
+ {
3045
+ "epoch": 16.120282104936837,
3046
+ "grad_norm": 6.800222873687744,
3047
+ "learning_rate": 3.879717895063164e-06,
3048
+ "loss": 2.2876,
3049
+ "step": 208000
3050
+ },
3051
+ {
3052
+ "epoch": 16.159032783073705,
3053
+ "grad_norm": 7.139950752258301,
3054
+ "learning_rate": 3.840967216926296e-06,
3055
+ "loss": 2.2975,
3056
+ "step": 208500
3057
+ },
3058
+ {
3059
+ "epoch": 16.19778346121057,
3060
+ "grad_norm": 6.805322170257568,
3061
+ "learning_rate": 3.802216538789429e-06,
3062
+ "loss": 2.2994,
3063
+ "step": 209000
3064
+ },
3065
+ {
3066
+ "epoch": 16.236534139347437,
3067
+ "grad_norm": 6.6877336502075195,
3068
+ "learning_rate": 3.7634658606525617e-06,
3069
+ "loss": 2.277,
3070
+ "step": 209500
3071
+ },
3072
+ {
3073
+ "epoch": 16.275284817484305,
3074
+ "grad_norm": 6.0831193923950195,
3075
+ "learning_rate": 3.724715182515694e-06,
3076
+ "loss": 2.3029,
3077
+ "step": 210000
3078
+ },
3079
+ {
3080
+ "epoch": 16.314035495621173,
3081
+ "grad_norm": 6.021631240844727,
3082
+ "learning_rate": 3.6859645043788265e-06,
3083
+ "loss": 2.2959,
3084
+ "step": 210500
3085
+ },
3086
+ {
3087
+ "epoch": 16.35278617375804,
3088
+ "grad_norm": 7.072383403778076,
3089
+ "learning_rate": 3.647213826241959e-06,
3090
+ "loss": 2.2709,
3091
+ "step": 211000
3092
+ },
3093
+ {
3094
+ "epoch": 16.39153685189491,
3095
+ "grad_norm": 6.0719404220581055,
3096
+ "learning_rate": 3.608463148105092e-06,
3097
+ "loss": 2.2952,
3098
+ "step": 211500
3099
+ },
3100
+ {
3101
+ "epoch": 16.430287530031777,
3102
+ "grad_norm": 6.733717441558838,
3103
+ "learning_rate": 3.5697124699682244e-06,
3104
+ "loss": 2.2984,
3105
+ "step": 212000
3106
+ },
3107
+ {
3108
+ "epoch": 16.469038208168644,
3109
+ "grad_norm": 7.269532203674316,
3110
+ "learning_rate": 3.530961791831357e-06,
3111
+ "loss": 2.2855,
3112
+ "step": 212500
3113
+ },
3114
+ {
3115
+ "epoch": 16.50778888630551,
3116
+ "grad_norm": 7.440357208251953,
3117
+ "learning_rate": 3.4922111136944897e-06,
3118
+ "loss": 2.2747,
3119
+ "step": 213000
3120
+ },
3121
+ {
3122
+ "epoch": 16.546539564442377,
3123
+ "grad_norm": 7.448116302490234,
3124
+ "learning_rate": 3.4534604355576223e-06,
3125
+ "loss": 2.2933,
3126
+ "step": 213500
3127
+ },
3128
+ {
3129
+ "epoch": 16.585290242579244,
3130
+ "grad_norm": 6.202878475189209,
3131
+ "learning_rate": 3.414709757420755e-06,
3132
+ "loss": 2.2963,
3133
+ "step": 214000
3134
+ },
3135
+ {
3136
+ "epoch": 16.624040920716112,
3137
+ "grad_norm": 7.019168376922607,
3138
+ "learning_rate": 3.3759590792838876e-06,
3139
+ "loss": 2.2667,
3140
+ "step": 214500
3141
+ },
3142
+ {
3143
+ "epoch": 16.66279159885298,
3144
+ "grad_norm": 6.448665142059326,
3145
+ "learning_rate": 3.3372084011470202e-06,
3146
+ "loss": 2.2905,
3147
+ "step": 215000
3148
+ },
3149
+ {
3150
+ "epoch": 16.701542276989848,
3151
+ "grad_norm": 6.160965442657471,
3152
+ "learning_rate": 3.298457723010153e-06,
3153
+ "loss": 2.2854,
3154
+ "step": 215500
3155
+ },
3156
+ {
3157
+ "epoch": 16.740292955126716,
3158
+ "grad_norm": 6.956637859344482,
3159
+ "learning_rate": 3.2597070448732855e-06,
3160
+ "loss": 2.2944,
3161
+ "step": 216000
3162
+ },
3163
+ {
3164
+ "epoch": 16.779043633263584,
3165
+ "grad_norm": 6.935131549835205,
3166
+ "learning_rate": 3.220956366736418e-06,
3167
+ "loss": 2.2795,
3168
+ "step": 216500
3169
+ },
3170
+ {
3171
+ "epoch": 16.817794311400448,
3172
+ "grad_norm": 6.656859397888184,
3173
+ "learning_rate": 3.1822056885995508e-06,
3174
+ "loss": 2.2872,
3175
+ "step": 217000
3176
+ },
3177
+ {
3178
+ "epoch": 16.856544989537316,
3179
+ "grad_norm": 6.204549312591553,
3180
+ "learning_rate": 3.1434550104626834e-06,
3181
+ "loss": 2.2832,
3182
+ "step": 217500
3183
+ },
3184
+ {
3185
+ "epoch": 16.895295667674183,
3186
+ "grad_norm": 6.77413272857666,
3187
+ "learning_rate": 3.1047043323258156e-06,
3188
+ "loss": 2.2719,
3189
+ "step": 218000
3190
+ },
3191
+ {
3192
+ "epoch": 16.93404634581105,
3193
+ "grad_norm": 6.447382926940918,
3194
+ "learning_rate": 3.0659536541889482e-06,
3195
+ "loss": 2.2702,
3196
+ "step": 218500
3197
+ },
3198
+ {
3199
+ "epoch": 16.97279702394792,
3200
+ "grad_norm": 7.396761894226074,
3201
+ "learning_rate": 3.027202976052081e-06,
3202
+ "loss": 2.2813,
3203
+ "step": 219000
3204
+ },
3205
+ {
3206
+ "epoch": 17.0,
3207
+ "eval_loss": 2.2362165451049805,
3208
+ "eval_runtime": 266.8391,
3209
+ "eval_samples_per_second": 773.743,
3210
+ "eval_steps_per_second": 12.093,
3211
+ "step": 219351
3212
+ },
3213
+ {
3214
+ "epoch": 17.011547702084787,
3215
+ "grad_norm": 6.575385093688965,
3216
+ "learning_rate": 2.9884522979152135e-06,
3217
+ "loss": 2.2747,
3218
+ "step": 219500
3219
+ },
3220
+ {
3221
+ "epoch": 17.050298380221655,
3222
+ "grad_norm": 7.168444633483887,
3223
+ "learning_rate": 2.949701619778346e-06,
3224
+ "loss": 2.2868,
3225
+ "step": 220000
3226
+ },
3227
+ {
3228
+ "epoch": 17.089049058358523,
3229
+ "grad_norm": 7.069167613983154,
3230
+ "learning_rate": 2.9109509416414788e-06,
3231
+ "loss": 2.2838,
3232
+ "step": 220500
3233
+ },
3234
+ {
3235
+ "epoch": 17.12779973649539,
3236
+ "grad_norm": 6.792834758758545,
3237
+ "learning_rate": 2.8722002635046114e-06,
3238
+ "loss": 2.2836,
3239
+ "step": 221000
3240
+ },
3241
+ {
3242
+ "epoch": 17.166550414632255,
3243
+ "grad_norm": 6.546488285064697,
3244
+ "learning_rate": 2.833449585367744e-06,
3245
+ "loss": 2.2733,
3246
+ "step": 221500
3247
+ },
3248
+ {
3249
+ "epoch": 17.205301092769123,
3250
+ "grad_norm": 6.293231010437012,
3251
+ "learning_rate": 2.7946989072308767e-06,
3252
+ "loss": 2.2688,
3253
+ "step": 222000
3254
+ },
3255
+ {
3256
+ "epoch": 17.24405177090599,
3257
+ "grad_norm": 6.560914039611816,
3258
+ "learning_rate": 2.7559482290940093e-06,
3259
+ "loss": 2.2787,
3260
+ "step": 222500
3261
+ },
3262
+ {
3263
+ "epoch": 17.28280244904286,
3264
+ "grad_norm": 6.571765422821045,
3265
+ "learning_rate": 2.717197550957142e-06,
3266
+ "loss": 2.2801,
3267
+ "step": 223000
3268
+ },
3269
+ {
3270
+ "epoch": 17.321553127179726,
3271
+ "grad_norm": 7.396661281585693,
3272
+ "learning_rate": 2.6784468728202746e-06,
3273
+ "loss": 2.28,
3274
+ "step": 223500
3275
+ },
3276
+ {
3277
+ "epoch": 17.360303805316594,
3278
+ "grad_norm": 6.239862442016602,
3279
+ "learning_rate": 2.6396961946834072e-06,
3280
+ "loss": 2.2743,
3281
+ "step": 224000
3282
+ },
3283
+ {
3284
+ "epoch": 17.39905448345346,
3285
+ "grad_norm": 6.766594886779785,
3286
+ "learning_rate": 2.60094551654654e-06,
3287
+ "loss": 2.2456,
3288
+ "step": 224500
3289
+ },
3290
+ {
3291
+ "epoch": 17.43780516159033,
3292
+ "grad_norm": 6.488914966583252,
3293
+ "learning_rate": 2.5621948384096725e-06,
3294
+ "loss": 2.2666,
3295
+ "step": 225000
3296
+ },
3297
+ {
3298
+ "epoch": 17.476555839727194,
3299
+ "grad_norm": 6.036900043487549,
3300
+ "learning_rate": 2.523444160272805e-06,
3301
+ "loss": 2.2577,
3302
+ "step": 225500
3303
+ },
3304
+ {
3305
+ "epoch": 17.51530651786406,
3306
+ "grad_norm": 6.977652549743652,
3307
+ "learning_rate": 2.4846934821359373e-06,
3308
+ "loss": 2.2657,
3309
+ "step": 226000
3310
+ },
3311
+ {
3312
+ "epoch": 17.55405719600093,
3313
+ "grad_norm": 6.468418121337891,
3314
+ "learning_rate": 2.44594280399907e-06,
3315
+ "loss": 2.2737,
3316
+ "step": 226500
3317
+ },
3318
+ {
3319
+ "epoch": 17.592807874137797,
3320
+ "grad_norm": 6.7042646408081055,
3321
+ "learning_rate": 2.4071921258622026e-06,
3322
+ "loss": 2.2685,
3323
+ "step": 227000
3324
+ },
3325
+ {
3326
+ "epoch": 17.631558552274665,
3327
+ "grad_norm": 6.591056823730469,
3328
+ "learning_rate": 2.3684414477253352e-06,
3329
+ "loss": 2.2836,
3330
+ "step": 227500
3331
+ },
3332
+ {
3333
+ "epoch": 17.670309230411533,
3334
+ "grad_norm": 7.078721523284912,
3335
+ "learning_rate": 2.329690769588468e-06,
3336
+ "loss": 2.2754,
3337
+ "step": 228000
3338
+ },
3339
+ {
3340
+ "epoch": 17.7090599085484,
3341
+ "grad_norm": 6.701901435852051,
3342
+ "learning_rate": 2.2909400914516005e-06,
3343
+ "loss": 2.2494,
3344
+ "step": 228500
3345
+ },
3346
+ {
3347
+ "epoch": 17.74781058668527,
3348
+ "grad_norm": 6.622567176818848,
3349
+ "learning_rate": 2.252189413314733e-06,
3350
+ "loss": 2.2689,
3351
+ "step": 229000
3352
+ },
3353
+ {
3354
+ "epoch": 17.786561264822133,
3355
+ "grad_norm": 6.573280334472656,
3356
+ "learning_rate": 2.2134387351778658e-06,
3357
+ "loss": 2.271,
3358
+ "step": 229500
3359
+ },
3360
+ {
3361
+ "epoch": 17.825311942959,
3362
+ "grad_norm": 6.9067206382751465,
3363
+ "learning_rate": 2.1746880570409984e-06,
3364
+ "loss": 2.2573,
3365
+ "step": 230000
3366
+ },
3367
+ {
3368
+ "epoch": 17.86406262109587,
3369
+ "grad_norm": 6.601592063903809,
3370
+ "learning_rate": 2.135937378904131e-06,
3371
+ "loss": 2.2743,
3372
+ "step": 230500
3373
+ },
3374
+ {
3375
+ "epoch": 17.902813299232736,
3376
+ "grad_norm": 6.949497699737549,
3377
+ "learning_rate": 2.0971867007672637e-06,
3378
+ "loss": 2.2644,
3379
+ "step": 231000
3380
+ },
3381
+ {
3382
+ "epoch": 17.941563977369604,
3383
+ "grad_norm": 5.614126205444336,
3384
+ "learning_rate": 2.0584360226303963e-06,
3385
+ "loss": 2.2608,
3386
+ "step": 231500
3387
+ },
3388
+ {
3389
+ "epoch": 17.980314655506472,
3390
+ "grad_norm": 6.880855560302734,
3391
+ "learning_rate": 2.019685344493529e-06,
3392
+ "loss": 2.2862,
3393
+ "step": 232000
3394
+ },
3395
+ {
3396
+ "epoch": 18.0,
3397
+ "eval_loss": 2.229489326477051,
3398
+ "eval_runtime": 270.0811,
3399
+ "eval_samples_per_second": 764.456,
3400
+ "eval_steps_per_second": 11.948,
3401
+ "step": 232254
3402
+ },
3403
+ {
3404
+ "epoch": 18.01906533364334,
3405
+ "grad_norm": 6.630836486816406,
3406
+ "learning_rate": 1.9809346663566616e-06,
3407
+ "loss": 2.2632,
3408
+ "step": 232500
3409
+ },
3410
+ {
3411
+ "epoch": 18.057816011780208,
3412
+ "grad_norm": 6.50869607925415,
3413
+ "learning_rate": 1.942183988219794e-06,
3414
+ "loss": 2.2584,
3415
+ "step": 233000
3416
+ },
3417
+ {
3418
+ "epoch": 18.096566689917072,
3419
+ "grad_norm": 6.81369161605835,
3420
+ "learning_rate": 1.9034333100829266e-06,
3421
+ "loss": 2.2599,
3422
+ "step": 233500
3423
+ },
3424
+ {
3425
+ "epoch": 18.13531736805394,
3426
+ "grad_norm": 6.202197074890137,
3427
+ "learning_rate": 1.8646826319460593e-06,
3428
+ "loss": 2.2532,
3429
+ "step": 234000
3430
+ },
3431
+ {
3432
+ "epoch": 18.174068046190808,
3433
+ "grad_norm": 6.907183647155762,
3434
+ "learning_rate": 1.8259319538091919e-06,
3435
+ "loss": 2.2553,
3436
+ "step": 234500
3437
+ },
3438
+ {
3439
+ "epoch": 18.212818724327676,
3440
+ "grad_norm": 7.445714473724365,
3441
+ "learning_rate": 1.7871812756723245e-06,
3442
+ "loss": 2.2586,
3443
+ "step": 235000
3444
+ },
3445
+ {
3446
+ "epoch": 18.251569402464543,
3447
+ "grad_norm": 6.844184398651123,
3448
+ "learning_rate": 1.7484305975354572e-06,
3449
+ "loss": 2.2502,
3450
+ "step": 235500
3451
+ },
3452
+ {
3453
+ "epoch": 18.29032008060141,
3454
+ "grad_norm": 6.495091438293457,
3455
+ "learning_rate": 1.7096799193985896e-06,
3456
+ "loss": 2.2703,
3457
+ "step": 236000
3458
+ },
3459
+ {
3460
+ "epoch": 18.32907075873828,
3461
+ "grad_norm": 6.848631858825684,
3462
+ "learning_rate": 1.6709292412617222e-06,
3463
+ "loss": 2.2494,
3464
+ "step": 236500
3465
+ },
3466
+ {
3467
+ "epoch": 18.367821436875147,
3468
+ "grad_norm": 6.527080535888672,
3469
+ "learning_rate": 1.6321785631248548e-06,
3470
+ "loss": 2.2676,
3471
+ "step": 237000
3472
+ },
3473
+ {
3474
+ "epoch": 18.40657211501201,
3475
+ "grad_norm": 6.402927875518799,
3476
+ "learning_rate": 1.5934278849879875e-06,
3477
+ "loss": 2.256,
3478
+ "step": 237500
3479
+ },
3480
+ {
3481
+ "epoch": 18.44532279314888,
3482
+ "grad_norm": 6.720060348510742,
3483
+ "learning_rate": 1.5546772068511201e-06,
3484
+ "loss": 2.256,
3485
+ "step": 238000
3486
+ },
3487
+ {
3488
+ "epoch": 18.484073471285747,
3489
+ "grad_norm": 6.392049312591553,
3490
+ "learning_rate": 1.5159265287142528e-06,
3491
+ "loss": 2.272,
3492
+ "step": 238500
3493
+ },
3494
+ {
3495
+ "epoch": 18.522824149422615,
3496
+ "grad_norm": 6.625200271606445,
3497
+ "learning_rate": 1.4771758505773854e-06,
3498
+ "loss": 2.2561,
3499
+ "step": 239000
3500
+ },
3501
+ {
3502
+ "epoch": 18.561574827559483,
3503
+ "grad_norm": 6.451653003692627,
3504
+ "learning_rate": 1.438425172440518e-06,
3505
+ "loss": 2.2518,
3506
+ "step": 239500
3507
+ },
3508
+ {
3509
+ "epoch": 18.60032550569635,
3510
+ "grad_norm": 6.246822357177734,
3511
+ "learning_rate": 1.3996744943036504e-06,
3512
+ "loss": 2.2541,
3513
+ "step": 240000
3514
+ },
3515
+ {
3516
+ "epoch": 18.639076183833218,
3517
+ "grad_norm": 6.265354156494141,
3518
+ "learning_rate": 1.360923816166783e-06,
3519
+ "loss": 2.2546,
3520
+ "step": 240500
3521
+ },
3522
+ {
3523
+ "epoch": 18.677826861970086,
3524
+ "grad_norm": 6.439133644104004,
3525
+ "learning_rate": 1.3221731380299157e-06,
3526
+ "loss": 2.2583,
3527
+ "step": 241000
3528
+ },
3529
+ {
3530
+ "epoch": 18.71657754010695,
3531
+ "grad_norm": 6.528525352478027,
3532
+ "learning_rate": 1.2834224598930483e-06,
3533
+ "loss": 2.2467,
3534
+ "step": 241500
3535
+ },
3536
+ {
3537
+ "epoch": 18.755328218243818,
3538
+ "grad_norm": 7.4315900802612305,
3539
+ "learning_rate": 1.2446717817561808e-06,
3540
+ "loss": 2.2585,
3541
+ "step": 242000
3542
+ },
3543
+ {
3544
+ "epoch": 18.794078896380686,
3545
+ "grad_norm": 7.4202141761779785,
3546
+ "learning_rate": 1.2059211036193134e-06,
3547
+ "loss": 2.2637,
3548
+ "step": 242500
3549
+ },
3550
+ {
3551
+ "epoch": 18.832829574517554,
3552
+ "grad_norm": 6.3204145431518555,
3553
+ "learning_rate": 1.167170425482446e-06,
3554
+ "loss": 2.264,
3555
+ "step": 243000
3556
+ },
3557
+ {
3558
+ "epoch": 18.87158025265442,
3559
+ "grad_norm": 6.220766067504883,
3560
+ "learning_rate": 1.1284197473455787e-06,
3561
+ "loss": 2.2705,
3562
+ "step": 243500
3563
+ },
3564
+ {
3565
+ "epoch": 18.91033093079129,
3566
+ "grad_norm": 6.558001518249512,
3567
+ "learning_rate": 1.0896690692087113e-06,
3568
+ "loss": 2.2632,
3569
+ "step": 244000
3570
+ },
3571
+ {
3572
+ "epoch": 18.949081608928157,
3573
+ "grad_norm": 6.786870956420898,
3574
+ "learning_rate": 1.050918391071844e-06,
3575
+ "loss": 2.2441,
3576
+ "step": 244500
3577
+ },
3578
+ {
3579
+ "epoch": 18.987832287065025,
3580
+ "grad_norm": 6.955057621002197,
3581
+ "learning_rate": 1.0121677129349766e-06,
3582
+ "loss": 2.2503,
3583
+ "step": 245000
3584
+ },
3585
+ {
3586
+ "epoch": 19.0,
3587
+ "eval_loss": 2.2231059074401855,
3588
+ "eval_runtime": 272.187,
3589
+ "eval_samples_per_second": 758.541,
3590
+ "eval_steps_per_second": 11.856,
3591
+ "step": 245157
3592
+ },
3593
+ {
3594
+ "epoch": 19.02658296520189,
3595
+ "grad_norm": 6.136529922485352,
3596
+ "learning_rate": 9.73417034798109e-07,
3597
+ "loss": 2.2528,
3598
+ "step": 245500
3599
+ },
3600
+ {
3601
+ "epoch": 19.065333643338757,
3602
+ "grad_norm": 7.144802093505859,
3603
+ "learning_rate": 9.346663566612417e-07,
3604
+ "loss": 2.248,
3605
+ "step": 246000
3606
+ },
3607
+ {
3608
+ "epoch": 19.104084321475625,
3609
+ "grad_norm": 5.582034111022949,
3610
+ "learning_rate": 8.959156785243743e-07,
3611
+ "loss": 2.2513,
3612
+ "step": 246500
3613
+ },
3614
+ {
3615
+ "epoch": 19.142834999612493,
3616
+ "grad_norm": 6.747804164886475,
3617
+ "learning_rate": 8.571650003875069e-07,
3618
+ "loss": 2.2647,
3619
+ "step": 247000
3620
+ },
3621
+ {
3622
+ "epoch": 19.18158567774936,
3623
+ "grad_norm": 6.1470417976379395,
3624
+ "learning_rate": 8.184143222506395e-07,
3625
+ "loss": 2.2548,
3626
+ "step": 247500
3627
+ },
3628
+ {
3629
+ "epoch": 19.22033635588623,
3630
+ "grad_norm": 6.574125289916992,
3631
+ "learning_rate": 7.79663644113772e-07,
3632
+ "loss": 2.2714,
3633
+ "step": 248000
3634
+ },
3635
+ {
3636
+ "epoch": 19.259087034023096,
3637
+ "grad_norm": 6.6587982177734375,
3638
+ "learning_rate": 7.409129659769046e-07,
3639
+ "loss": 2.2491,
3640
+ "step": 248500
3641
+ },
3642
+ {
3643
+ "epoch": 19.297837712159964,
3644
+ "grad_norm": 6.578282356262207,
3645
+ "learning_rate": 7.021622878400372e-07,
3646
+ "loss": 2.2483,
3647
+ "step": 249000
3648
+ },
3649
+ {
3650
+ "epoch": 19.33658839029683,
3651
+ "grad_norm": 6.449355602264404,
3652
+ "learning_rate": 6.634116097031699e-07,
3653
+ "loss": 2.2558,
3654
+ "step": 249500
3655
+ },
3656
+ {
3657
+ "epoch": 19.375339068433696,
3658
+ "grad_norm": 5.921240329742432,
3659
+ "learning_rate": 6.246609315663025e-07,
3660
+ "loss": 2.2428,
3661
+ "step": 250000
3662
+ },
3663
+ {
3664
+ "epoch": 19.414089746570564,
3665
+ "grad_norm": 6.655218124389648,
3666
+ "learning_rate": 5.859102534294351e-07,
3667
+ "loss": 2.2616,
3668
+ "step": 250500
3669
+ },
3670
+ {
3671
+ "epoch": 19.452840424707432,
3672
+ "grad_norm": 6.733659744262695,
3673
+ "learning_rate": 5.471595752925676e-07,
3674
+ "loss": 2.2481,
3675
+ "step": 251000
3676
+ },
3677
+ {
3678
+ "epoch": 19.4915911028443,
3679
+ "grad_norm": 6.9586968421936035,
3680
+ "learning_rate": 5.084088971557003e-07,
3681
+ "loss": 2.2495,
3682
+ "step": 251500
3683
+ },
3684
+ {
3685
+ "epoch": 19.530341780981168,
3686
+ "grad_norm": 6.441699028015137,
3687
+ "learning_rate": 4.6965821901883286e-07,
3688
+ "loss": 2.2456,
3689
+ "step": 252000
3690
+ },
3691
+ {
3692
+ "epoch": 19.569092459118036,
3693
+ "grad_norm": 6.126708984375,
3694
+ "learning_rate": 4.3090754088196544e-07,
3695
+ "loss": 2.2561,
3696
+ "step": 252500
3697
+ },
3698
+ {
3699
+ "epoch": 19.607843137254903,
3700
+ "grad_norm": 6.69553279876709,
3701
+ "learning_rate": 3.921568627450981e-07,
3702
+ "loss": 2.2435,
3703
+ "step": 253000
3704
+ },
3705
+ {
3706
+ "epoch": 19.646593815391768,
3707
+ "grad_norm": 7.468321800231934,
3708
+ "learning_rate": 3.5340618460823066e-07,
3709
+ "loss": 2.2517,
3710
+ "step": 253500
3711
+ },
3712
+ {
3713
+ "epoch": 19.685344493528635,
3714
+ "grad_norm": 6.724938869476318,
3715
+ "learning_rate": 3.146555064713633e-07,
3716
+ "loss": 2.2307,
3717
+ "step": 254000
3718
+ },
3719
+ {
3720
+ "epoch": 19.724095171665503,
3721
+ "grad_norm": 6.407966613769531,
3722
+ "learning_rate": 2.7590482833449587e-07,
3723
+ "loss": 2.2469,
3724
+ "step": 254500
3725
+ },
3726
+ {
3727
+ "epoch": 19.76284584980237,
3728
+ "grad_norm": 6.521556377410889,
3729
+ "learning_rate": 2.3715415019762845e-07,
3730
+ "loss": 2.2266,
3731
+ "step": 255000
3732
+ },
3733
+ {
3734
+ "epoch": 19.80159652793924,
3735
+ "grad_norm": 6.066943645477295,
3736
+ "learning_rate": 1.9840347206076106e-07,
3737
+ "loss": 2.2448,
3738
+ "step": 255500
3739
+ },
3740
+ {
3741
+ "epoch": 19.840347206076107,
3742
+ "grad_norm": 6.806808948516846,
3743
+ "learning_rate": 1.5965279392389367e-07,
3744
+ "loss": 2.2355,
3745
+ "step": 256000
3746
+ },
3747
+ {
3748
+ "epoch": 19.879097884212975,
3749
+ "grad_norm": 6.567816734313965,
3750
+ "learning_rate": 1.2090211578702627e-07,
3751
+ "loss": 2.2581,
3752
+ "step": 256500
3753
+ },
3754
+ {
3755
+ "epoch": 19.917848562349842,
3756
+ "grad_norm": 7.037693977355957,
3757
+ "learning_rate": 8.21514376501589e-08,
3758
+ "loss": 2.2558,
3759
+ "step": 257000
3760
+ },
3761
+ {
3762
+ "epoch": 19.95659924048671,
3763
+ "grad_norm": 7.10353422164917,
3764
+ "learning_rate": 4.340075951329148e-08,
3765
+ "loss": 2.2513,
3766
+ "step": 257500
3767
+ },
3768
+ {
3769
+ "epoch": 19.995349918623575,
3770
+ "grad_norm": 7.407341003417969,
3771
+ "learning_rate": 4.6500813764240875e-09,
3772
+ "loss": 2.2448,
3773
+ "step": 258000
3774
+ },
3775
+ {
3776
+ "epoch": 20.0,
3777
+ "eval_loss": 2.211674213409424,
3778
+ "eval_runtime": 270.2294,
3779
+ "eval_samples_per_second": 764.036,
3780
+ "eval_steps_per_second": 11.942,
3781
+ "step": 258060
3782
  }
3783
  ],
3784
  "logging_steps": 500,
3785
+ "max_steps": 258060,
3786
  "num_input_tokens_seen": 0,
3787
+ "num_train_epochs": 20,
3788
  "save_steps": 500,
3789
  "stateful_callbacks": {
3790
  "TrainerControl": {
 
3798
  "attributes": {}
3799
  }
3800
  },
3801
+ "total_flos": 6.469828059827256e+17,
3802
  "train_batch_size": 64,
3803
  "trial_name": null,
3804
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c29e483a17b334874faa05ddef49273d268f86505fb1cbbeea2a4cddc04a790
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74ba329b03609527987c8126060f1c2c7e67ac7fad6da2fce4fad8d3324d3a3b
3
  size 5048