Rakhman16 commited on
Commit
f280478
·
verified ·
1 Parent(s): 5c6a24d

Training in progress, step 9000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb836486a08d083b7b894d25da52e76162af179301082404d283271cad95c54d
3
  size 891558696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c61dc0d23c746a776fb71c6db18e58f3d7ee58c49709af2b68f3cd60b2b05597
3
  size 891558696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4e1caae54a6748055529a623a21d7278d4621242d7eaa9f6da41a75e89af6e0
3
  size 1783272762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a73653d2a28732f9feb60c8158dce6b6083589210ea76baed95254ba447c70b7
3
  size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f0078c995c0a1f7692e135d27dd71fdcbf3affc32f173a64d44dd714e0d5938
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3503e42fc861ccbd3ddfb9b88789bab6a4d3e2eb8ec1f89fd80f3ace0029d6bb
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2b19bed9a80268123689bf367dc93ecf09aff888bfe5efc180838ee789e8b295
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdf46ec856f5e9ea116316566a745b60cfe58cd9d7993812727afdb74f6eab30
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.10507776588201523,
3
- "best_model_checkpoint": "./fine-tuned/checkpoint-8500",
4
- "epoch": 1.4929305348204092,
5
  "eval_steps": 100,
6
- "global_step": 8500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1877,6 +1877,116 @@
1877
  "eval_samples_per_second": 25.401,
1878
  "eval_steps_per_second": 3.178,
1879
  "step": 8500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1880
  }
1881
  ],
1882
  "logging_steps": 50,
@@ -1896,7 +2006,7 @@
1896
  "attributes": {}
1897
  }
1898
  },
1899
- "total_flos": 4.140700520546304e+16,
1900
  "train_batch_size": 8,
1901
  "trial_name": null,
1902
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.10463293641805649,
3
+ "best_model_checkpoint": "./fine-tuned/checkpoint-9000",
4
+ "epoch": 1.580749978045139,
5
  "eval_steps": 100,
6
+ "global_step": 9000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1877
  "eval_samples_per_second": 25.401,
1878
  "eval_steps_per_second": 3.178,
1879
  "step": 8500
1880
+ },
1881
+ {
1882
+ "epoch": 1.5017124791428822,
1883
+ "grad_norm": 9354.1494140625,
1884
+ "learning_rate": 1.873616722290532e-05,
1885
+ "loss": 0.1087,
1886
+ "step": 8550
1887
+ },
1888
+ {
1889
+ "epoch": 1.5104944234653552,
1890
+ "grad_norm": 171124.34375,
1891
+ "learning_rate": 1.867029685578781e-05,
1892
+ "loss": 0.1106,
1893
+ "step": 8600
1894
+ },
1895
+ {
1896
+ "epoch": 1.5104944234653552,
1897
+ "eval_loss": 0.10511680692434311,
1898
+ "eval_runtime": 175.6013,
1899
+ "eval_samples_per_second": 25.398,
1900
+ "eval_steps_per_second": 3.178,
1901
+ "step": 8600
1902
+ },
1903
+ {
1904
+ "epoch": 1.5192763677878283,
1905
+ "grad_norm": 10542.4892578125,
1906
+ "learning_rate": 1.8604426488670297e-05,
1907
+ "loss": 0.1042,
1908
+ "step": 8650
1909
+ },
1910
+ {
1911
+ "epoch": 1.5280583121103013,
1912
+ "grad_norm": 8730.197265625,
1913
+ "learning_rate": 1.8538556121552784e-05,
1914
+ "loss": 0.1028,
1915
+ "step": 8700
1916
+ },
1917
+ {
1918
+ "epoch": 1.5280583121103013,
1919
+ "eval_loss": 0.10496073216199875,
1920
+ "eval_runtime": 175.5259,
1921
+ "eval_samples_per_second": 25.409,
1922
+ "eval_steps_per_second": 3.179,
1923
+ "step": 8700
1924
+ },
1925
+ {
1926
+ "epoch": 1.5368402564327743,
1927
+ "grad_norm": 22947.765625,
1928
+ "learning_rate": 1.847268575443527e-05,
1929
+ "loss": 0.1106,
1930
+ "step": 8750
1931
+ },
1932
+ {
1933
+ "epoch": 1.5456222007552474,
1934
+ "grad_norm": 12794.203125,
1935
+ "learning_rate": 1.840681538731776e-05,
1936
+ "loss": 0.1027,
1937
+ "step": 8800
1938
+ },
1939
+ {
1940
+ "epoch": 1.5456222007552474,
1941
+ "eval_loss": 0.10489310324192047,
1942
+ "eval_runtime": 175.3848,
1943
+ "eval_samples_per_second": 25.43,
1944
+ "eval_steps_per_second": 3.182,
1945
+ "step": 8800
1946
+ },
1947
+ {
1948
+ "epoch": 1.5544041450777202,
1949
+ "grad_norm": 9543.232421875,
1950
+ "learning_rate": 1.8340945020200247e-05,
1951
+ "loss": 0.107,
1952
+ "step": 8850
1953
+ },
1954
+ {
1955
+ "epoch": 1.5631860894001932,
1956
+ "grad_norm": 7341.599609375,
1957
+ "learning_rate": 1.8275074653082734e-05,
1958
+ "loss": 0.0986,
1959
+ "step": 8900
1960
+ },
1961
+ {
1962
+ "epoch": 1.5631860894001932,
1963
+ "eval_loss": 0.10493362694978714,
1964
+ "eval_runtime": 175.8527,
1965
+ "eval_samples_per_second": 25.362,
1966
+ "eval_steps_per_second": 3.173,
1967
+ "step": 8900
1968
+ },
1969
+ {
1970
+ "epoch": 1.5719680337226662,
1971
+ "grad_norm": 9900.4501953125,
1972
+ "learning_rate": 1.820920428596522e-05,
1973
+ "loss": 0.1101,
1974
+ "step": 8950
1975
+ },
1976
+ {
1977
+ "epoch": 1.580749978045139,
1978
+ "grad_norm": 9512.732421875,
1979
+ "learning_rate": 1.8143333918847707e-05,
1980
+ "loss": 0.1014,
1981
+ "step": 9000
1982
+ },
1983
+ {
1984
+ "epoch": 1.580749978045139,
1985
+ "eval_loss": 0.10463293641805649,
1986
+ "eval_runtime": 175.3499,
1987
+ "eval_samples_per_second": 25.435,
1988
+ "eval_steps_per_second": 3.182,
1989
+ "step": 9000
1990
  }
1991
  ],
1992
  "logging_steps": 50,
 
2006
  "attributes": {}
2007
  }
2008
  },
2009
+ "total_flos": 4.384283676770304e+16,
2010
  "train_batch_size": 8,
2011
  "trial_name": null,
2012
  "trial_params": null