Weyaxi commited on
Commit
03d9028
1 Parent(s): b35e0cd

Upload folder using huggingface_hub

Browse files
global_step428/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31eb677de80948991321bfe446bcdaf4165d3f8c9efa96cfadccfaf3a07e6432
3
+ size 5435499827
global_step428/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09e4495311047938552b63e0739ae4a551323c70509aed4b806aae92544537a7
3
+ size 5435499827
global_step428/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e1e42c3abfc5e8f3bf1af140fef44ff8f8441a9256840b045d424c01026ce1e
3
+ size 5435499827
global_step428/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:398bfa2896865cf30cef24ec6b68f47976d7f16cb3df673efceb7804d4b93241
3
+ size 5435499827
global_step428/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a62bcb45fde6ec6327735c7a280ac3ce1f417f8924c8774a8c82a17cfbf4c295
3
+ size 5435499827
global_step428/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb1d3d30b3e766144ee852f023bd74030024ab52c9f32771cdef96fb92ea360a
3
+ size 5435499827
global_step428/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:874cb020cb85f973bcb6aabf19b131ea5f4119dfb72f80578d61c90b95b95b8f
3
+ size 5435499827
global_step428/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68f1d3311e2ce3605022c06ddc3064dd9e31f60d59b8cef8e11e6d4ffb6c19c7
3
+ size 5435499827
global_step428/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57e8c2eb8c8299c2c958127742fbf228e53106a6568c4971b8afabbd0109ff90
3
+ size 153829
global_step428/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e282c8dfa4e2bc4ca41f74c0855b60c0b95c8b57349ed0f767b0ac93c6d90e4
3
+ size 153829
global_step428/zero_pp_rank_2_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a05bcb052dcc69a98918a82611f0e1e96657c85e106f480eda0074811399d0ca
3
+ size 153829
global_step428/zero_pp_rank_3_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55815a63565a06e023966b6a11030d06cfdc86e64fac8c811f57b00ab0e1e847
3
+ size 153829
global_step428/zero_pp_rank_4_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43ea8ade0f653c47a1ac50680612d8e5218345ab1b86cb35384d76d563dd75cc
3
+ size 153829
global_step428/zero_pp_rank_5_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9650144687908dce67d9631417bd0ee2e129d6fa67ed87825da1ef4167112639
3
+ size 153829
global_step428/zero_pp_rank_6_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fb9e75a82b054c216c566b9165d02270f6da8be51ccc5c3ec334b0c2004d830
3
+ size 153829
global_step428/zero_pp_rank_7_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47f5259391522659ec37fb679a3619fb26f04f6a1908753f5d9f85e2dda46e3f
3
+ size 153829
latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step428
rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1bec598899f9d59e70c1b4705ce420a1e0a670957b6c8153a589880068ae5a4
3
+ size 15984
rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c60d2348aae518f4c44693db9c9b4b3a3299c556e7f0a86c188b2e4c3e364a7c
3
+ size 15984
rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffe5a79d3bcb4ce033de360bc765e616316e3562aba25887cd85c4adbb935abf
3
+ size 15984
rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9a9d1f6e22677721841890e6a27855857e6840137650d609eb8e4ac13b71d29
3
+ size 15984
rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcac4ff84388a6a4fe3bcae6207c68b2ee5528fb3b6de8cc3588fe1975462aa5
3
+ size 15984
rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33fce3cdf5c1b8a8a291e0c73b384e3ad5252640e21e942b44b26b8b0928ffa9
3
+ size 15984
rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:919e675f3bcaf4f3c8ba35cd8debf85aec3bbc3c8e5019b74431e0a314e4d37a
3
+ size 15984
rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bf6479ce82b88efc6a72a8ee512162b3d0ecab972817296d38ab9c448bb8d96
3
+ size 15984
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee8c731be5df723d260a08657a32a14cf7657c00593a1a4bb80bc7c00b297026
3
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,2629 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.5059249997138977,
3
+ "best_model_checkpoint": "./Einstein-v3-model/checkpoint-428",
4
+ "epoch": 0.9994162288382954,
5
+ "eval_steps": 107,
6
+ "global_step": 428,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 5.000000000000001e-07,
14
+ "loss": 1.038,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 0.0,
19
+ "eval_loss": 1.125040888786316,
20
+ "eval_runtime": 969.9573,
21
+ "eval_samples_per_second": 1.161,
22
+ "eval_steps_per_second": 0.145,
23
+ "step": 1
24
+ },
25
+ {
26
+ "epoch": 0.0,
27
+ "learning_rate": 1.0000000000000002e-06,
28
+ "loss": 1.0692,
29
+ "step": 2
30
+ },
31
+ {
32
+ "epoch": 0.01,
33
+ "learning_rate": 1.5e-06,
34
+ "loss": 1.0161,
35
+ "step": 3
36
+ },
37
+ {
38
+ "epoch": 0.01,
39
+ "learning_rate": 2.0000000000000003e-06,
40
+ "loss": 0.9363,
41
+ "step": 4
42
+ },
43
+ {
44
+ "epoch": 0.01,
45
+ "learning_rate": 2.5e-06,
46
+ "loss": 0.8647,
47
+ "step": 5
48
+ },
49
+ {
50
+ "epoch": 0.01,
51
+ "learning_rate": 3e-06,
52
+ "loss": 0.9888,
53
+ "step": 6
54
+ },
55
+ {
56
+ "epoch": 0.02,
57
+ "learning_rate": 3.5e-06,
58
+ "loss": 0.8645,
59
+ "step": 7
60
+ },
61
+ {
62
+ "epoch": 0.02,
63
+ "learning_rate": 4.000000000000001e-06,
64
+ "loss": 0.763,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.02,
69
+ "learning_rate": 4.5e-06,
70
+ "loss": 0.8058,
71
+ "step": 9
72
+ },
73
+ {
74
+ "epoch": 0.02,
75
+ "learning_rate": 5e-06,
76
+ "loss": 0.789,
77
+ "step": 10
78
+ },
79
+ {
80
+ "epoch": 0.03,
81
+ "learning_rate": 4.9999293917983325e-06,
82
+ "loss": 0.798,
83
+ "step": 11
84
+ },
85
+ {
86
+ "epoch": 0.03,
87
+ "learning_rate": 4.999717571181742e-06,
88
+ "loss": 0.7411,
89
+ "step": 12
90
+ },
91
+ {
92
+ "epoch": 0.03,
93
+ "learning_rate": 4.9993645501152485e-06,
94
+ "loss": 0.748,
95
+ "step": 13
96
+ },
97
+ {
98
+ "epoch": 0.03,
99
+ "learning_rate": 4.998870348539797e-06,
100
+ "loss": 0.7312,
101
+ "step": 14
102
+ },
103
+ {
104
+ "epoch": 0.04,
105
+ "learning_rate": 4.998234994371135e-06,
106
+ "loss": 0.7179,
107
+ "step": 15
108
+ },
109
+ {
110
+ "epoch": 0.04,
111
+ "learning_rate": 4.997458523498236e-06,
112
+ "loss": 0.7158,
113
+ "step": 16
114
+ },
115
+ {
116
+ "epoch": 0.04,
117
+ "learning_rate": 4.996540979781269e-06,
118
+ "loss": 0.6975,
119
+ "step": 17
120
+ },
121
+ {
122
+ "epoch": 0.04,
123
+ "learning_rate": 4.995482415049123e-06,
124
+ "loss": 0.6809,
125
+ "step": 18
126
+ },
127
+ {
128
+ "epoch": 0.04,
129
+ "learning_rate": 4.99428288909648e-06,
130
+ "loss": 0.6709,
131
+ "step": 19
132
+ },
133
+ {
134
+ "epoch": 0.05,
135
+ "learning_rate": 4.992942469680437e-06,
136
+ "loss": 0.6778,
137
+ "step": 20
138
+ },
139
+ {
140
+ "epoch": 0.05,
141
+ "learning_rate": 4.991461232516675e-06,
142
+ "loss": 0.69,
143
+ "step": 21
144
+ },
145
+ {
146
+ "epoch": 0.05,
147
+ "learning_rate": 4.989839261275191e-06,
148
+ "loss": 0.6518,
149
+ "step": 22
150
+ },
151
+ {
152
+ "epoch": 0.05,
153
+ "learning_rate": 4.988076647575562e-06,
154
+ "loss": 0.6712,
155
+ "step": 23
156
+ },
157
+ {
158
+ "epoch": 0.06,
159
+ "learning_rate": 4.986173490981773e-06,
160
+ "loss": 0.6429,
161
+ "step": 24
162
+ },
163
+ {
164
+ "epoch": 0.06,
165
+ "learning_rate": 4.984129898996599e-06,
166
+ "loss": 0.682,
167
+ "step": 25
168
+ },
169
+ {
170
+ "epoch": 0.06,
171
+ "learning_rate": 4.981945987055521e-06,
172
+ "loss": 0.6661,
173
+ "step": 26
174
+ },
175
+ {
176
+ "epoch": 0.06,
177
+ "learning_rate": 4.979621878520217e-06,
178
+ "loss": 0.6287,
179
+ "step": 27
180
+ },
181
+ {
182
+ "epoch": 0.07,
183
+ "learning_rate": 4.977157704671585e-06,
184
+ "loss": 0.6642,
185
+ "step": 28
186
+ },
187
+ {
188
+ "epoch": 0.07,
189
+ "learning_rate": 4.974553604702332e-06,
190
+ "loss": 0.6644,
191
+ "step": 29
192
+ },
193
+ {
194
+ "epoch": 0.07,
195
+ "learning_rate": 4.971809725709112e-06,
196
+ "loss": 0.6453,
197
+ "step": 30
198
+ },
199
+ {
200
+ "epoch": 0.07,
201
+ "learning_rate": 4.968926222684213e-06,
202
+ "loss": 0.6199,
203
+ "step": 31
204
+ },
205
+ {
206
+ "epoch": 0.07,
207
+ "learning_rate": 4.965903258506806e-06,
208
+ "loss": 0.6038,
209
+ "step": 32
210
+ },
211
+ {
212
+ "epoch": 0.08,
213
+ "learning_rate": 4.9627410039337426e-06,
214
+ "loss": 0.6493,
215
+ "step": 33
216
+ },
217
+ {
218
+ "epoch": 0.08,
219
+ "learning_rate": 4.959439637589909e-06,
220
+ "loss": 0.6304,
221
+ "step": 34
222
+ },
223
+ {
224
+ "epoch": 0.08,
225
+ "learning_rate": 4.9559993459581375e-06,
226
+ "loss": 0.6692,
227
+ "step": 35
228
+ },
229
+ {
230
+ "epoch": 0.08,
231
+ "learning_rate": 4.952420323368673e-06,
232
+ "loss": 0.5978,
233
+ "step": 36
234
+ },
235
+ {
236
+ "epoch": 0.09,
237
+ "learning_rate": 4.948702771988195e-06,
238
+ "loss": 0.6318,
239
+ "step": 37
240
+ },
241
+ {
242
+ "epoch": 0.09,
243
+ "learning_rate": 4.944846901808397e-06,
244
+ "loss": 0.5838,
245
+ "step": 38
246
+ },
247
+ {
248
+ "epoch": 0.09,
249
+ "learning_rate": 4.940852930634126e-06,
250
+ "loss": 0.5884,
251
+ "step": 39
252
+ },
253
+ {
254
+ "epoch": 0.09,
255
+ "learning_rate": 4.936721084071079e-06,
256
+ "loss": 0.5955,
257
+ "step": 40
258
+ },
259
+ {
260
+ "epoch": 0.1,
261
+ "learning_rate": 4.932451595513063e-06,
262
+ "loss": 0.5704,
263
+ "step": 41
264
+ },
265
+ {
266
+ "epoch": 0.1,
267
+ "learning_rate": 4.928044706128803e-06,
268
+ "loss": 0.5711,
269
+ "step": 42
270
+ },
271
+ {
272
+ "epoch": 0.1,
273
+ "learning_rate": 4.923500664848327e-06,
274
+ "loss": 0.5886,
275
+ "step": 43
276
+ },
277
+ {
278
+ "epoch": 0.1,
279
+ "learning_rate": 4.918819728348901e-06,
280
+ "loss": 0.5802,
281
+ "step": 44
282
+ },
283
+ {
284
+ "epoch": 0.11,
285
+ "learning_rate": 4.9140021610405335e-06,
286
+ "loss": 0.6221,
287
+ "step": 45
288
+ },
289
+ {
290
+ "epoch": 0.11,
291
+ "learning_rate": 4.909048235051033e-06,
292
+ "loss": 0.5859,
293
+ "step": 46
294
+ },
295
+ {
296
+ "epoch": 0.11,
297
+ "learning_rate": 4.903958230210647e-06,
298
+ "loss": 0.5906,
299
+ "step": 47
300
+ },
301
+ {
302
+ "epoch": 0.11,
303
+ "learning_rate": 4.8987324340362445e-06,
304
+ "loss": 0.586,
305
+ "step": 48
306
+ },
307
+ {
308
+ "epoch": 0.11,
309
+ "learning_rate": 4.89337114171508e-06,
310
+ "loss": 0.5782,
311
+ "step": 49
312
+ },
313
+ {
314
+ "epoch": 0.12,
315
+ "learning_rate": 4.887874656088124e-06,
316
+ "loss": 0.6341,
317
+ "step": 50
318
+ },
319
+ {
320
+ "epoch": 0.12,
321
+ "learning_rate": 4.882243287632947e-06,
322
+ "loss": 0.5753,
323
+ "step": 51
324
+ },
325
+ {
326
+ "epoch": 0.12,
327
+ "learning_rate": 4.8764773544461895e-06,
328
+ "loss": 0.5896,
329
+ "step": 52
330
+ },
331
+ {
332
+ "epoch": 0.12,
333
+ "learning_rate": 4.8705771822255895e-06,
334
+ "loss": 0.567,
335
+ "step": 53
336
+ },
337
+ {
338
+ "epoch": 0.13,
339
+ "learning_rate": 4.864543104251587e-06,
340
+ "loss": 0.5943,
341
+ "step": 54
342
+ },
343
+ {
344
+ "epoch": 0.13,
345
+ "learning_rate": 4.858375461368499e-06,
346
+ "loss": 0.5942,
347
+ "step": 55
348
+ },
349
+ {
350
+ "epoch": 0.13,
351
+ "learning_rate": 4.852074601965261e-06,
352
+ "loss": 0.5639,
353
+ "step": 56
354
+ },
355
+ {
356
+ "epoch": 0.13,
357
+ "learning_rate": 4.845640881955757e-06,
358
+ "loss": 0.6058,
359
+ "step": 57
360
+ },
361
+ {
362
+ "epoch": 0.14,
363
+ "learning_rate": 4.839074664758705e-06,
364
+ "loss": 0.5695,
365
+ "step": 58
366
+ },
367
+ {
368
+ "epoch": 0.14,
369
+ "learning_rate": 4.832376321277136e-06,
370
+ "loss": 0.6233,
371
+ "step": 59
372
+ },
373
+ {
374
+ "epoch": 0.14,
375
+ "learning_rate": 4.825546229877439e-06,
376
+ "loss": 0.5617,
377
+ "step": 60
378
+ },
379
+ {
380
+ "epoch": 0.14,
381
+ "learning_rate": 4.818584776367992e-06,
382
+ "loss": 0.586,
383
+ "step": 61
384
+ },
385
+ {
386
+ "epoch": 0.14,
387
+ "learning_rate": 4.811492353977366e-06,
388
+ "loss": 0.5609,
389
+ "step": 62
390
+ },
391
+ {
392
+ "epoch": 0.15,
393
+ "learning_rate": 4.804269363332112e-06,
394
+ "loss": 0.5371,
395
+ "step": 63
396
+ },
397
+ {
398
+ "epoch": 0.15,
399
+ "learning_rate": 4.7969162124341354e-06,
400
+ "loss": 0.5585,
401
+ "step": 64
402
+ },
403
+ {
404
+ "epoch": 0.15,
405
+ "learning_rate": 4.789433316637644e-06,
406
+ "loss": 0.5548,
407
+ "step": 65
408
+ },
409
+ {
410
+ "epoch": 0.15,
411
+ "learning_rate": 4.781821098625691e-06,
412
+ "loss": 0.5613,
413
+ "step": 66
414
+ },
415
+ {
416
+ "epoch": 0.16,
417
+ "learning_rate": 4.7740799883862966e-06,
418
+ "loss": 0.5483,
419
+ "step": 67
420
+ },
421
+ {
422
+ "epoch": 0.16,
423
+ "learning_rate": 4.766210423188158e-06,
424
+ "loss": 0.5892,
425
+ "step": 68
426
+ },
427
+ {
428
+ "epoch": 0.16,
429
+ "learning_rate": 4.758212847555953e-06,
430
+ "loss": 0.5813,
431
+ "step": 69
432
+ },
433
+ {
434
+ "epoch": 0.16,
435
+ "learning_rate": 4.750087713245227e-06,
436
+ "loss": 0.552,
437
+ "step": 70
438
+ },
439
+ {
440
+ "epoch": 0.17,
441
+ "learning_rate": 4.74183547921688e-06,
442
+ "loss": 0.5478,
443
+ "step": 71
444
+ },
445
+ {
446
+ "epoch": 0.17,
447
+ "learning_rate": 4.733456611611233e-06,
448
+ "loss": 0.5648,
449
+ "step": 72
450
+ },
451
+ {
452
+ "epoch": 0.17,
453
+ "learning_rate": 4.7249515837217075e-06,
454
+ "loss": 0.5717,
455
+ "step": 73
456
+ },
457
+ {
458
+ "epoch": 0.17,
459
+ "learning_rate": 4.716320875968081e-06,
460
+ "loss": 0.5916,
461
+ "step": 74
462
+ },
463
+ {
464
+ "epoch": 0.18,
465
+ "learning_rate": 4.707564975869357e-06,
466
+ "loss": 0.5562,
467
+ "step": 75
468
+ },
469
+ {
470
+ "epoch": 0.18,
471
+ "learning_rate": 4.698684378016223e-06,
472
+ "loss": 0.5347,
473
+ "step": 76
474
+ },
475
+ {
476
+ "epoch": 0.18,
477
+ "learning_rate": 4.6896795840431155e-06,
478
+ "loss": 0.5595,
479
+ "step": 77
480
+ },
481
+ {
482
+ "epoch": 0.18,
483
+ "learning_rate": 4.680551102599881e-06,
484
+ "loss": 0.564,
485
+ "step": 78
486
+ },
487
+ {
488
+ "epoch": 0.18,
489
+ "learning_rate": 4.671299449323045e-06,
490
+ "loss": 0.5646,
491
+ "step": 79
492
+ },
493
+ {
494
+ "epoch": 0.19,
495
+ "learning_rate": 4.66192514680669e-06,
496
+ "loss": 0.5667,
497
+ "step": 80
498
+ },
499
+ {
500
+ "epoch": 0.19,
501
+ "learning_rate": 4.652428724572929e-06,
502
+ "loss": 0.5726,
503
+ "step": 81
504
+ },
505
+ {
506
+ "epoch": 0.19,
507
+ "learning_rate": 4.642810719041999e-06,
508
+ "loss": 0.544,
509
+ "step": 82
510
+ },
511
+ {
512
+ "epoch": 0.19,
513
+ "learning_rate": 4.63307167350196e-06,
514
+ "loss": 0.5673,
515
+ "step": 83
516
+ },
517
+ {
518
+ "epoch": 0.2,
519
+ "learning_rate": 4.623212138078004e-06,
520
+ "loss": 0.5743,
521
+ "step": 84
522
+ },
523
+ {
524
+ "epoch": 0.2,
525
+ "learning_rate": 4.613232669701384e-06,
526
+ "loss": 0.5605,
527
+ "step": 85
528
+ },
529
+ {
530
+ "epoch": 0.2,
531
+ "learning_rate": 4.603133832077953e-06,
532
+ "loss": 0.5754,
533
+ "step": 86
534
+ },
535
+ {
536
+ "epoch": 0.2,
537
+ "learning_rate": 4.592916195656322e-06,
538
+ "loss": 0.5578,
539
+ "step": 87
540
+ },
541
+ {
542
+ "epoch": 0.21,
543
+ "learning_rate": 4.582580337595636e-06,
544
+ "loss": 0.5261,
545
+ "step": 88
546
+ },
547
+ {
548
+ "epoch": 0.21,
549
+ "learning_rate": 4.572126841732977e-06,
550
+ "loss": 0.5864,
551
+ "step": 89
552
+ },
553
+ {
554
+ "epoch": 0.21,
555
+ "learning_rate": 4.561556298550379e-06,
556
+ "loss": 0.5388,
557
+ "step": 90
558
+ },
559
+ {
560
+ "epoch": 0.21,
561
+ "learning_rate": 4.550869305141478e-06,
562
+ "loss": 0.5662,
563
+ "step": 91
564
+ },
565
+ {
566
+ "epoch": 0.21,
567
+ "learning_rate": 4.5400664651777835e-06,
568
+ "loss": 0.5359,
569
+ "step": 92
570
+ },
571
+ {
572
+ "epoch": 0.22,
573
+ "learning_rate": 4.529148388874577e-06,
574
+ "loss": 0.5485,
575
+ "step": 93
576
+ },
577
+ {
578
+ "epoch": 0.22,
579
+ "learning_rate": 4.518115692956445e-06,
580
+ "loss": 0.5715,
581
+ "step": 94
582
+ },
583
+ {
584
+ "epoch": 0.22,
585
+ "learning_rate": 4.506969000622443e-06,
586
+ "loss": 0.5412,
587
+ "step": 95
588
+ },
589
+ {
590
+ "epoch": 0.22,
591
+ "learning_rate": 4.49570894151089e-06,
592
+ "loss": 0.5475,
593
+ "step": 96
594
+ },
595
+ {
596
+ "epoch": 0.23,
597
+ "learning_rate": 4.484336151663807e-06,
598
+ "loss": 0.5443,
599
+ "step": 97
600
+ },
601
+ {
602
+ "epoch": 0.23,
603
+ "learning_rate": 4.472851273490985e-06,
604
+ "loss": 0.5519,
605
+ "step": 98
606
+ },
607
+ {
608
+ "epoch": 0.23,
609
+ "learning_rate": 4.4612549557336975e-06,
610
+ "loss": 0.5293,
611
+ "step": 99
612
+ },
613
+ {
614
+ "epoch": 0.23,
615
+ "learning_rate": 4.449547853428061e-06,
616
+ "loss": 0.5545,
617
+ "step": 100
618
+ },
619
+ {
620
+ "epoch": 0.24,
621
+ "learning_rate": 4.437730627868028e-06,
622
+ "loss": 0.5361,
623
+ "step": 101
624
+ },
625
+ {
626
+ "epoch": 0.24,
627
+ "learning_rate": 4.425803946568033e-06,
628
+ "loss": 0.547,
629
+ "step": 102
630
+ },
631
+ {
632
+ "epoch": 0.24,
633
+ "learning_rate": 4.413768483225292e-06,
634
+ "loss": 0.532,
635
+ "step": 103
636
+ },
637
+ {
638
+ "epoch": 0.24,
639
+ "learning_rate": 4.401624917681743e-06,
640
+ "loss": 0.5515,
641
+ "step": 104
642
+ },
643
+ {
644
+ "epoch": 0.25,
645
+ "learning_rate": 4.3893739358856465e-06,
646
+ "loss": 0.5424,
647
+ "step": 105
648
+ },
649
+ {
650
+ "epoch": 0.25,
651
+ "learning_rate": 4.377016229852836e-06,
652
+ "loss": 0.5217,
653
+ "step": 106
654
+ },
655
+ {
656
+ "epoch": 0.25,
657
+ "learning_rate": 4.364552497627632e-06,
658
+ "loss": 0.5254,
659
+ "step": 107
660
+ },
661
+ {
662
+ "epoch": 0.25,
663
+ "eval_loss": 0.5753679871559143,
664
+ "eval_runtime": 972.8304,
665
+ "eval_samples_per_second": 1.157,
666
+ "eval_steps_per_second": 0.145,
667
+ "step": 107
668
+ },
669
+ {
670
+ "epoch": 0.25,
671
+ "learning_rate": 4.3519834432434095e-06,
672
+ "loss": 0.5599,
673
+ "step": 108
674
+ },
675
+ {
676
+ "epoch": 0.25,
677
+ "learning_rate": 4.33930977668283e-06,
678
+ "loss": 0.5479,
679
+ "step": 109
680
+ },
681
+ {
682
+ "epoch": 0.26,
683
+ "learning_rate": 4.326532213837735e-06,
684
+ "loss": 0.582,
685
+ "step": 110
686
+ },
687
+ {
688
+ "epoch": 0.26,
689
+ "learning_rate": 4.3136514764687155e-06,
690
+ "loss": 0.5545,
691
+ "step": 111
692
+ },
693
+ {
694
+ "epoch": 0.26,
695
+ "learning_rate": 4.300668292164329e-06,
696
+ "loss": 0.5487,
697
+ "step": 112
698
+ },
699
+ {
700
+ "epoch": 0.26,
701
+ "learning_rate": 4.287583394300016e-06,
702
+ "loss": 0.5644,
703
+ "step": 113
704
+ },
705
+ {
706
+ "epoch": 0.27,
707
+ "learning_rate": 4.274397521996658e-06,
708
+ "loss": 0.5358,
709
+ "step": 114
710
+ },
711
+ {
712
+ "epoch": 0.27,
713
+ "learning_rate": 4.261111420078844e-06,
714
+ "loss": 0.5425,
715
+ "step": 115
716
+ },
717
+ {
718
+ "epoch": 0.27,
719
+ "learning_rate": 4.247725839032781e-06,
720
+ "loss": 0.5211,
721
+ "step": 116
722
+ },
723
+ {
724
+ "epoch": 0.27,
725
+ "learning_rate": 4.234241534963916e-06,
726
+ "loss": 0.5663,
727
+ "step": 117
728
+ },
729
+ {
730
+ "epoch": 0.28,
731
+ "learning_rate": 4.220659269554217e-06,
732
+ "loss": 0.5382,
733
+ "step": 118
734
+ },
735
+ {
736
+ "epoch": 0.28,
737
+ "learning_rate": 4.206979810019153e-06,
738
+ "loss": 0.554,
739
+ "step": 119
740
+ },
741
+ {
742
+ "epoch": 0.28,
743
+ "learning_rate": 4.1932039290643534e-06,
744
+ "loss": 0.528,
745
+ "step": 120
746
+ },
747
+ {
748
+ "epoch": 0.28,
749
+ "learning_rate": 4.179332404841963e-06,
750
+ "loss": 0.5146,
751
+ "step": 121
752
+ },
753
+ {
754
+ "epoch": 0.28,
755
+ "learning_rate": 4.1653660209066835e-06,
756
+ "loss": 0.5608,
757
+ "step": 122
758
+ },
759
+ {
760
+ "epoch": 0.29,
761
+ "learning_rate": 4.151305566171521e-06,
762
+ "loss": 0.5573,
763
+ "step": 123
764
+ },
765
+ {
766
+ "epoch": 0.29,
767
+ "learning_rate": 4.137151834863213e-06,
768
+ "loss": 0.5345,
769
+ "step": 124
770
+ },
771
+ {
772
+ "epoch": 0.29,
773
+ "learning_rate": 4.122905626477371e-06,
774
+ "loss": 0.5434,
775
+ "step": 125
776
+ },
777
+ {
778
+ "epoch": 0.29,
779
+ "learning_rate": 4.108567745733318e-06,
780
+ "loss": 0.5685,
781
+ "step": 126
782
+ },
783
+ {
784
+ "epoch": 0.3,
785
+ "learning_rate": 4.094139002528635e-06,
786
+ "loss": 0.5442,
787
+ "step": 127
788
+ },
789
+ {
790
+ "epoch": 0.3,
791
+ "learning_rate": 4.07962021189341e-06,
792
+ "loss": 0.4957,
793
+ "step": 128
794
+ },
795
+ {
796
+ "epoch": 0.3,
797
+ "learning_rate": 4.065012193944201e-06,
798
+ "loss": 0.5833,
799
+ "step": 129
800
+ },
801
+ {
802
+ "epoch": 0.3,
803
+ "learning_rate": 4.050315773837708e-06,
804
+ "loss": 0.5178,
805
+ "step": 130
806
+ },
807
+ {
808
+ "epoch": 0.31,
809
+ "learning_rate": 4.0355317817241705e-06,
810
+ "loss": 0.553,
811
+ "step": 131
812
+ },
813
+ {
814
+ "epoch": 0.31,
815
+ "learning_rate": 4.020661052700462e-06,
816
+ "loss": 0.5466,
817
+ "step": 132
818
+ },
819
+ {
820
+ "epoch": 0.31,
821
+ "learning_rate": 4.00570442676293e-06,
822
+ "loss": 0.5412,
823
+ "step": 133
824
+ },
825
+ {
826
+ "epoch": 0.31,
827
+ "learning_rate": 3.990662748759946e-06,
828
+ "loss": 0.5573,
829
+ "step": 134
830
+ },
831
+ {
832
+ "epoch": 0.32,
833
+ "learning_rate": 3.975536868344174e-06,
834
+ "loss": 0.5593,
835
+ "step": 135
836
+ },
837
+ {
838
+ "epoch": 0.32,
839
+ "learning_rate": 3.9603276399245864e-06,
840
+ "loss": 0.5251,
841
+ "step": 136
842
+ },
843
+ {
844
+ "epoch": 0.32,
845
+ "learning_rate": 3.945035922618198e-06,
846
+ "loss": 0.5341,
847
+ "step": 137
848
+ },
849
+ {
850
+ "epoch": 0.32,
851
+ "learning_rate": 3.929662580201536e-06,
852
+ "loss": 0.5267,
853
+ "step": 138
854
+ },
855
+ {
856
+ "epoch": 0.32,
857
+ "learning_rate": 3.91420848106185e-06,
858
+ "loss": 0.5161,
859
+ "step": 139
860
+ },
861
+ {
862
+ "epoch": 0.33,
863
+ "learning_rate": 3.898674498148058e-06,
864
+ "loss": 0.5403,
865
+ "step": 140
866
+ },
867
+ {
868
+ "epoch": 0.33,
869
+ "learning_rate": 3.883061508921439e-06,
870
+ "loss": 0.536,
871
+ "step": 141
872
+ },
873
+ {
874
+ "epoch": 0.33,
875
+ "learning_rate": 3.8673703953060685e-06,
876
+ "loss": 0.5374,
877
+ "step": 142
878
+ },
879
+ {
880
+ "epoch": 0.33,
881
+ "learning_rate": 3.8516020436389945e-06,
882
+ "loss": 0.4947,
883
+ "step": 143
884
+ },
885
+ {
886
+ "epoch": 0.34,
887
+ "learning_rate": 3.835757344620183e-06,
888
+ "loss": 0.5328,
889
+ "step": 144
890
+ },
891
+ {
892
+ "epoch": 0.34,
893
+ "learning_rate": 3.819837193262197e-06,
894
+ "loss": 0.5306,
895
+ "step": 145
896
+ },
897
+ {
898
+ "epoch": 0.34,
899
+ "learning_rate": 3.803842488839642e-06,
900
+ "loss": 0.5334,
901
+ "step": 146
902
+ },
903
+ {
904
+ "epoch": 0.34,
905
+ "learning_rate": 3.7877741348383703e-06,
906
+ "loss": 0.5238,
907
+ "step": 147
908
+ },
909
+ {
910
+ "epoch": 0.35,
911
+ "learning_rate": 3.7716330389044463e-06,
912
+ "loss": 0.5396,
913
+ "step": 148
914
+ },
915
+ {
916
+ "epoch": 0.35,
917
+ "learning_rate": 3.7554201127928747e-06,
918
+ "loss": 0.5121,
919
+ "step": 149
920
+ },
921
+ {
922
+ "epoch": 0.35,
923
+ "learning_rate": 3.739136272316102e-06,
924
+ "loss": 0.5662,
925
+ "step": 150
926
+ },
927
+ {
928
+ "epoch": 0.35,
929
+ "learning_rate": 3.72278243729228e-06,
930
+ "loss": 0.4901,
931
+ "step": 151
932
+ },
933
+ {
934
+ "epoch": 0.35,
935
+ "learning_rate": 3.706359531493316e-06,
936
+ "loss": 0.4966,
937
+ "step": 152
938
+ },
939
+ {
940
+ "epoch": 0.36,
941
+ "learning_rate": 3.6898684825926845e-06,
942
+ "loss": 0.5597,
943
+ "step": 153
944
+ },
945
+ {
946
+ "epoch": 0.36,
947
+ "learning_rate": 3.6733102221130303e-06,
948
+ "loss": 0.5164,
949
+ "step": 154
950
+ },
951
+ {
952
+ "epoch": 0.36,
953
+ "learning_rate": 3.656685685373552e-06,
954
+ "loss": 0.5555,
955
+ "step": 155
956
+ },
957
+ {
958
+ "epoch": 0.36,
959
+ "learning_rate": 3.6399958114371597e-06,
960
+ "loss": 0.5309,
961
+ "step": 156
962
+ },
963
+ {
964
+ "epoch": 0.37,
965
+ "learning_rate": 3.623241543057445e-06,
966
+ "loss": 0.5468,
967
+ "step": 157
968
+ },
969
+ {
970
+ "epoch": 0.37,
971
+ "learning_rate": 3.606423826625414e-06,
972
+ "loss": 0.5183,
973
+ "step": 158
974
+ },
975
+ {
976
+ "epoch": 0.37,
977
+ "learning_rate": 3.5895436121160388e-06,
978
+ "loss": 0.5361,
979
+ "step": 159
980
+ },
981
+ {
982
+ "epoch": 0.37,
983
+ "learning_rate": 3.5726018530345913e-06,
984
+ "loss": 0.5343,
985
+ "step": 160
986
+ },
987
+ {
988
+ "epoch": 0.38,
989
+ "learning_rate": 3.5555995063627842e-06,
990
+ "loss": 0.5548,
991
+ "step": 161
992
+ },
993
+ {
994
+ "epoch": 0.38,
995
+ "learning_rate": 3.5385375325047167e-06,
996
+ "loss": 0.5728,
997
+ "step": 162
998
+ },
999
+ {
1000
+ "epoch": 0.38,
1001
+ "learning_rate": 3.5214168952326205e-06,
1002
+ "loss": 0.5234,
1003
+ "step": 163
1004
+ },
1005
+ {
1006
+ "epoch": 0.38,
1007
+ "learning_rate": 3.5042385616324243e-06,
1008
+ "loss": 0.5047,
1009
+ "step": 164
1010
+ },
1011
+ {
1012
+ "epoch": 0.39,
1013
+ "learning_rate": 3.4870035020491216e-06,
1014
+ "loss": 0.5179,
1015
+ "step": 165
1016
+ },
1017
+ {
1018
+ "epoch": 0.39,
1019
+ "learning_rate": 3.469712690031962e-06,
1020
+ "loss": 0.5013,
1021
+ "step": 166
1022
+ },
1023
+ {
1024
+ "epoch": 0.39,
1025
+ "learning_rate": 3.4523671022794612e-06,
1026
+ "loss": 0.5113,
1027
+ "step": 167
1028
+ },
1029
+ {
1030
+ "epoch": 0.39,
1031
+ "learning_rate": 3.4349677185842246e-06,
1032
+ "loss": 0.5694,
1033
+ "step": 168
1034
+ },
1035
+ {
1036
+ "epoch": 0.39,
1037
+ "learning_rate": 3.4175155217776057e-06,
1038
+ "loss": 0.4965,
1039
+ "step": 169
1040
+ },
1041
+ {
1042
+ "epoch": 0.4,
1043
+ "learning_rate": 3.4000114976741905e-06,
1044
+ "loss": 0.5475,
1045
+ "step": 170
1046
+ },
1047
+ {
1048
+ "epoch": 0.4,
1049
+ "learning_rate": 3.38245663501611e-06,
1050
+ "loss": 0.5014,
1051
+ "step": 171
1052
+ },
1053
+ {
1054
+ "epoch": 0.4,
1055
+ "learning_rate": 3.3648519254171906e-06,
1056
+ "loss": 0.5079,
1057
+ "step": 172
1058
+ },
1059
+ {
1060
+ "epoch": 0.4,
1061
+ "learning_rate": 3.3471983633069414e-06,
1062
+ "loss": 0.5222,
1063
+ "step": 173
1064
+ },
1065
+ {
1066
+ "epoch": 0.41,
1067
+ "learning_rate": 3.32949694587438e-06,
1068
+ "loss": 0.5039,
1069
+ "step": 174
1070
+ },
1071
+ {
1072
+ "epoch": 0.41,
1073
+ "learning_rate": 3.3117486730117092e-06,
1074
+ "loss": 0.5067,
1075
+ "step": 175
1076
+ },
1077
+ {
1078
+ "epoch": 0.41,
1079
+ "learning_rate": 3.2939545472578314e-06,
1080
+ "loss": 0.5575,
1081
+ "step": 176
1082
+ },
1083
+ {
1084
+ "epoch": 0.41,
1085
+ "learning_rate": 3.276115573741724e-06,
1086
+ "loss": 0.5155,
1087
+ "step": 177
1088
+ },
1089
+ {
1090
+ "epoch": 0.42,
1091
+ "learning_rate": 3.2582327601256567e-06,
1092
+ "loss": 0.4915,
1093
+ "step": 178
1094
+ },
1095
+ {
1096
+ "epoch": 0.42,
1097
+ "learning_rate": 3.240307116548279e-06,
1098
+ "loss": 0.5415,
1099
+ "step": 179
1100
+ },
1101
+ {
1102
+ "epoch": 0.42,
1103
+ "learning_rate": 3.222339655567556e-06,
1104
+ "loss": 0.5326,
1105
+ "step": 180
1106
+ },
1107
+ {
1108
+ "epoch": 0.42,
1109
+ "learning_rate": 3.2043313921035747e-06,
1110
+ "loss": 0.5921,
1111
+ "step": 181
1112
+ },
1113
+ {
1114
+ "epoch": 0.42,
1115
+ "learning_rate": 3.1862833433812137e-06,
1116
+ "loss": 0.5318,
1117
+ "step": 182
1118
+ },
1119
+ {
1120
+ "epoch": 0.43,
1121
+ "learning_rate": 3.1681965288726825e-06,
1122
+ "loss": 0.5151,
1123
+ "step": 183
1124
+ },
1125
+ {
1126
+ "epoch": 0.43,
1127
+ "learning_rate": 3.1500719702399406e-06,
1128
+ "loss": 0.5281,
1129
+ "step": 184
1130
+ },
1131
+ {
1132
+ "epoch": 0.43,
1133
+ "learning_rate": 3.1319106912769797e-06,
1134
+ "loss": 0.5046,
1135
+ "step": 185
1136
+ },
1137
+ {
1138
+ "epoch": 0.43,
1139
+ "learning_rate": 3.1137137178519983e-06,
1140
+ "loss": 0.5022,
1141
+ "step": 186
1142
+ },
1143
+ {
1144
+ "epoch": 0.44,
1145
+ "learning_rate": 3.0954820778494516e-06,
1146
+ "loss": 0.5835,
1147
+ "step": 187
1148
+ },
1149
+ {
1150
+ "epoch": 0.44,
1151
+ "learning_rate": 3.0772168011119894e-06,
1152
+ "loss": 0.514,
1153
+ "step": 188
1154
+ },
1155
+ {
1156
+ "epoch": 0.44,
1157
+ "learning_rate": 3.0589189193822894e-06,
1158
+ "loss": 0.5291,
1159
+ "step": 189
1160
+ },
1161
+ {
1162
+ "epoch": 0.44,
1163
+ "learning_rate": 3.0405894662447682e-06,
1164
+ "loss": 0.5186,
1165
+ "step": 190
1166
+ },
1167
+ {
1168
+ "epoch": 0.45,
1169
+ "learning_rate": 3.0222294770672054e-06,
1170
+ "loss": 0.5483,
1171
+ "step": 191
1172
+ },
1173
+ {
1174
+ "epoch": 0.45,
1175
+ "learning_rate": 3.0038399889422553e-06,
1176
+ "loss": 0.5561,
1177
+ "step": 192
1178
+ },
1179
+ {
1180
+ "epoch": 0.45,
1181
+ "learning_rate": 2.985422040628867e-06,
1182
+ "loss": 0.5307,
1183
+ "step": 193
1184
+ },
1185
+ {
1186
+ "epoch": 0.45,
1187
+ "learning_rate": 2.9669766724936074e-06,
1188
+ "loss": 0.5137,
1189
+ "step": 194
1190
+ },
1191
+ {
1192
+ "epoch": 0.46,
1193
+ "learning_rate": 2.948504926451896e-06,
1194
+ "loss": 0.5222,
1195
+ "step": 195
1196
+ },
1197
+ {
1198
+ "epoch": 0.46,
1199
+ "learning_rate": 2.930007845909146e-06,
1200
+ "loss": 0.5254,
1201
+ "step": 196
1202
+ },
1203
+ {
1204
+ "epoch": 0.46,
1205
+ "learning_rate": 2.911486475701835e-06,
1206
+ "loss": 0.5369,
1207
+ "step": 197
1208
+ },
1209
+ {
1210
+ "epoch": 0.46,
1211
+ "learning_rate": 2.892941862038475e-06,
1212
+ "loss": 0.5371,
1213
+ "step": 198
1214
+ },
1215
+ {
1216
+ "epoch": 0.46,
1217
+ "learning_rate": 2.8743750524405254e-06,
1218
+ "loss": 0.5285,
1219
+ "step": 199
1220
+ },
1221
+ {
1222
+ "epoch": 0.47,
1223
+ "learning_rate": 2.8557870956832135e-06,
1224
+ "loss": 0.5319,
1225
+ "step": 200
1226
+ },
1227
+ {
1228
+ "epoch": 0.47,
1229
+ "learning_rate": 2.837179041736299e-06,
1230
+ "loss": 0.4983,
1231
+ "step": 201
1232
+ },
1233
+ {
1234
+ "epoch": 0.47,
1235
+ "learning_rate": 2.8185519417047624e-06,
1236
+ "loss": 0.4962,
1237
+ "step": 202
1238
+ },
1239
+ {
1240
+ "epoch": 0.47,
1241
+ "learning_rate": 2.799906847769433e-06,
1242
+ "loss": 0.5055,
1243
+ "step": 203
1244
+ },
1245
+ {
1246
+ "epoch": 0.48,
1247
+ "learning_rate": 2.781244813127552e-06,
1248
+ "loss": 0.4918,
1249
+ "step": 204
1250
+ },
1251
+ {
1252
+ "epoch": 0.48,
1253
+ "learning_rate": 2.762566891933285e-06,
1254
+ "loss": 0.5191,
1255
+ "step": 205
1256
+ },
1257
+ {
1258
+ "epoch": 0.48,
1259
+ "learning_rate": 2.743874139238171e-06,
1260
+ "loss": 0.5509,
1261
+ "step": 206
1262
+ },
1263
+ {
1264
+ "epoch": 0.48,
1265
+ "learning_rate": 2.725167610931534e-06,
1266
+ "loss": 0.5296,
1267
+ "step": 207
1268
+ },
1269
+ {
1270
+ "epoch": 0.49,
1271
+ "learning_rate": 2.7064483636808314e-06,
1272
+ "loss": 0.5335,
1273
+ "step": 208
1274
+ },
1275
+ {
1276
+ "epoch": 0.49,
1277
+ "learning_rate": 2.687717454871971e-06,
1278
+ "loss": 0.4982,
1279
+ "step": 209
1280
+ },
1281
+ {
1282
+ "epoch": 0.49,
1283
+ "learning_rate": 2.6689759425495833e-06,
1284
+ "loss": 0.4864,
1285
+ "step": 210
1286
+ },
1287
+ {
1288
+ "epoch": 0.49,
1289
+ "learning_rate": 2.650224885357251e-06,
1290
+ "loss": 0.5334,
1291
+ "step": 211
1292
+ },
1293
+ {
1294
+ "epoch": 0.5,
1295
+ "learning_rate": 2.6314653424777194e-06,
1296
+ "loss": 0.5309,
1297
+ "step": 212
1298
+ },
1299
+ {
1300
+ "epoch": 0.5,
1301
+ "learning_rate": 2.612698373573056e-06,
1302
+ "loss": 0.5346,
1303
+ "step": 213
1304
+ },
1305
+ {
1306
+ "epoch": 0.5,
1307
+ "learning_rate": 2.593925038724802e-06,
1308
+ "loss": 0.5144,
1309
+ "step": 214
1310
+ },
1311
+ {
1312
+ "epoch": 0.5,
1313
+ "eval_loss": 0.5360019207000732,
1314
+ "eval_runtime": 973.5712,
1315
+ "eval_samples_per_second": 1.157,
1316
+ "eval_steps_per_second": 0.145,
1317
+ "step": 214
1318
+ },
1319
+ {
1320
+ "epoch": 0.5,
1321
+ "learning_rate": 2.575146398374087e-06,
1322
+ "loss": 0.5263,
1323
+ "step": 215
1324
+ },
1325
+ {
1326
+ "epoch": 0.5,
1327
+ "learning_rate": 2.5563635132617305e-06,
1328
+ "loss": 0.5135,
1329
+ "step": 216
1330
+ },
1331
+ {
1332
+ "epoch": 0.51,
1333
+ "learning_rate": 2.5375774443683263e-06,
1334
+ "loss": 0.5003,
1335
+ "step": 217
1336
+ },
1337
+ {
1338
+ "epoch": 0.51,
1339
+ "learning_rate": 2.518789252854305e-06,
1340
+ "loss": 0.5005,
1341
+ "step": 218
1342
+ },
1343
+ {
1344
+ "epoch": 0.51,
1345
+ "learning_rate": 2.5e-06,
1346
+ "loss": 0.5006,
1347
+ "step": 219
1348
+ },
1349
+ {
1350
+ "epoch": 0.51,
1351
+ "learning_rate": 2.4812107471456958e-06,
1352
+ "loss": 0.5052,
1353
+ "step": 220
1354
+ },
1355
+ {
1356
+ "epoch": 0.52,
1357
+ "learning_rate": 2.4624225556316745e-06,
1358
+ "loss": 0.4966,
1359
+ "step": 221
1360
+ },
1361
+ {
1362
+ "epoch": 0.52,
1363
+ "learning_rate": 2.44363648673827e-06,
1364
+ "loss": 0.4997,
1365
+ "step": 222
1366
+ },
1367
+ {
1368
+ "epoch": 0.52,
1369
+ "learning_rate": 2.4248536016259137e-06,
1370
+ "loss": 0.535,
1371
+ "step": 223
1372
+ },
1373
+ {
1374
+ "epoch": 0.52,
1375
+ "learning_rate": 2.4060749612751987e-06,
1376
+ "loss": 0.5156,
1377
+ "step": 224
1378
+ },
1379
+ {
1380
+ "epoch": 0.53,
1381
+ "learning_rate": 2.3873016264269446e-06,
1382
+ "loss": 0.5505,
1383
+ "step": 225
1384
+ },
1385
+ {
1386
+ "epoch": 0.53,
1387
+ "learning_rate": 2.368534657522281e-06,
1388
+ "loss": 0.4973,
1389
+ "step": 226
1390
+ },
1391
+ {
1392
+ "epoch": 0.53,
1393
+ "learning_rate": 2.3497751146427494e-06,
1394
+ "loss": 0.5186,
1395
+ "step": 227
1396
+ },
1397
+ {
1398
+ "epoch": 0.53,
1399
+ "learning_rate": 2.3310240574504184e-06,
1400
+ "loss": 0.5199,
1401
+ "step": 228
1402
+ },
1403
+ {
1404
+ "epoch": 0.53,
1405
+ "learning_rate": 2.3122825451280294e-06,
1406
+ "loss": 0.546,
1407
+ "step": 229
1408
+ },
1409
+ {
1410
+ "epoch": 0.54,
1411
+ "learning_rate": 2.2935516363191695e-06,
1412
+ "loss": 0.4939,
1413
+ "step": 230
1414
+ },
1415
+ {
1416
+ "epoch": 0.54,
1417
+ "learning_rate": 2.2748323890684664e-06,
1418
+ "loss": 0.5012,
1419
+ "step": 231
1420
+ },
1421
+ {
1422
+ "epoch": 0.54,
1423
+ "learning_rate": 2.2561258607618296e-06,
1424
+ "loss": 0.4843,
1425
+ "step": 232
1426
+ },
1427
+ {
1428
+ "epoch": 0.54,
1429
+ "learning_rate": 2.2374331080667168e-06,
1430
+ "loss": 0.5267,
1431
+ "step": 233
1432
+ },
1433
+ {
1434
+ "epoch": 0.55,
1435
+ "learning_rate": 2.2187551868724487e-06,
1436
+ "loss": 0.4927,
1437
+ "step": 234
1438
+ },
1439
+ {
1440
+ "epoch": 0.55,
1441
+ "learning_rate": 2.200093152230568e-06,
1442
+ "loss": 0.4968,
1443
+ "step": 235
1444
+ },
1445
+ {
1446
+ "epoch": 0.55,
1447
+ "learning_rate": 2.1814480582952376e-06,
1448
+ "loss": 0.4787,
1449
+ "step": 236
1450
+ },
1451
+ {
1452
+ "epoch": 0.55,
1453
+ "learning_rate": 2.1628209582637024e-06,
1454
+ "loss": 0.4645,
1455
+ "step": 237
1456
+ },
1457
+ {
1458
+ "epoch": 0.56,
1459
+ "learning_rate": 2.1442129043167877e-06,
1460
+ "loss": 0.5348,
1461
+ "step": 238
1462
+ },
1463
+ {
1464
+ "epoch": 0.56,
1465
+ "learning_rate": 2.125624947559475e-06,
1466
+ "loss": 0.5095,
1467
+ "step": 239
1468
+ },
1469
+ {
1470
+ "epoch": 0.56,
1471
+ "learning_rate": 2.1070581379615253e-06,
1472
+ "loss": 0.5282,
1473
+ "step": 240
1474
+ },
1475
+ {
1476
+ "epoch": 0.56,
1477
+ "learning_rate": 2.088513524298165e-06,
1478
+ "loss": 0.5046,
1479
+ "step": 241
1480
+ },
1481
+ {
1482
+ "epoch": 0.57,
1483
+ "learning_rate": 2.0699921540908542e-06,
1484
+ "loss": 0.5325,
1485
+ "step": 242
1486
+ },
1487
+ {
1488
+ "epoch": 0.57,
1489
+ "learning_rate": 2.0514950735481053e-06,
1490
+ "loss": 0.5104,
1491
+ "step": 243
1492
+ },
1493
+ {
1494
+ "epoch": 0.57,
1495
+ "learning_rate": 2.033023327506393e-06,
1496
+ "loss": 0.4881,
1497
+ "step": 244
1498
+ },
1499
+ {
1500
+ "epoch": 0.57,
1501
+ "learning_rate": 2.014577959371134e-06,
1502
+ "loss": 0.4938,
1503
+ "step": 245
1504
+ },
1505
+ {
1506
+ "epoch": 0.57,
1507
+ "learning_rate": 1.996160011057746e-06,
1508
+ "loss": 0.4977,
1509
+ "step": 246
1510
+ },
1511
+ {
1512
+ "epoch": 0.58,
1513
+ "learning_rate": 1.9777705229327954e-06,
1514
+ "loss": 0.4936,
1515
+ "step": 247
1516
+ },
1517
+ {
1518
+ "epoch": 0.58,
1519
+ "learning_rate": 1.959410533755232e-06,
1520
+ "loss": 0.5176,
1521
+ "step": 248
1522
+ },
1523
+ {
1524
+ "epoch": 0.58,
1525
+ "learning_rate": 1.9410810806177105e-06,
1526
+ "loss": 0.5057,
1527
+ "step": 249
1528
+ },
1529
+ {
1530
+ "epoch": 0.58,
1531
+ "learning_rate": 1.922783198888011e-06,
1532
+ "loss": 0.4982,
1533
+ "step": 250
1534
+ },
1535
+ {
1536
+ "epoch": 0.59,
1537
+ "learning_rate": 1.9045179221505497e-06,
1538
+ "loss": 0.5128,
1539
+ "step": 251
1540
+ },
1541
+ {
1542
+ "epoch": 0.59,
1543
+ "learning_rate": 1.8862862821480023e-06,
1544
+ "loss": 0.5186,
1545
+ "step": 252
1546
+ },
1547
+ {
1548
+ "epoch": 0.59,
1549
+ "learning_rate": 1.8680893087230207e-06,
1550
+ "loss": 0.5112,
1551
+ "step": 253
1552
+ },
1553
+ {
1554
+ "epoch": 0.59,
1555
+ "learning_rate": 1.8499280297600594e-06,
1556
+ "loss": 0.4928,
1557
+ "step": 254
1558
+ },
1559
+ {
1560
+ "epoch": 0.6,
1561
+ "learning_rate": 1.8318034711273181e-06,
1562
+ "loss": 0.5209,
1563
+ "step": 255
1564
+ },
1565
+ {
1566
+ "epoch": 0.6,
1567
+ "learning_rate": 1.813716656618788e-06,
1568
+ "loss": 0.5073,
1569
+ "step": 256
1570
+ },
1571
+ {
1572
+ "epoch": 0.6,
1573
+ "learning_rate": 1.7956686078964257e-06,
1574
+ "loss": 0.4724,
1575
+ "step": 257
1576
+ },
1577
+ {
1578
+ "epoch": 0.6,
1579
+ "learning_rate": 1.7776603444324445e-06,
1580
+ "loss": 0.51,
1581
+ "step": 258
1582
+ },
1583
+ {
1584
+ "epoch": 0.6,
1585
+ "learning_rate": 1.759692883451721e-06,
1586
+ "loss": 0.5193,
1587
+ "step": 259
1588
+ },
1589
+ {
1590
+ "epoch": 0.61,
1591
+ "learning_rate": 1.741767239874344e-06,
1592
+ "loss": 0.519,
1593
+ "step": 260
1594
+ },
1595
+ {
1596
+ "epoch": 0.61,
1597
+ "learning_rate": 1.723884426258277e-06,
1598
+ "loss": 0.5073,
1599
+ "step": 261
1600
+ },
1601
+ {
1602
+ "epoch": 0.61,
1603
+ "learning_rate": 1.7060454527421688e-06,
1604
+ "loss": 0.5178,
1605
+ "step": 262
1606
+ },
1607
+ {
1608
+ "epoch": 0.61,
1609
+ "learning_rate": 1.6882513269882916e-06,
1610
+ "loss": 0.5006,
1611
+ "step": 263
1612
+ },
1613
+ {
1614
+ "epoch": 0.62,
1615
+ "learning_rate": 1.6705030541256211e-06,
1616
+ "loss": 0.4955,
1617
+ "step": 264
1618
+ },
1619
+ {
1620
+ "epoch": 0.62,
1621
+ "learning_rate": 1.6528016366930594e-06,
1622
+ "loss": 0.5231,
1623
+ "step": 265
1624
+ },
1625
+ {
1626
+ "epoch": 0.62,
1627
+ "learning_rate": 1.6351480745828098e-06,
1628
+ "loss": 0.4647,
1629
+ "step": 266
1630
+ },
1631
+ {
1632
+ "epoch": 0.62,
1633
+ "learning_rate": 1.6175433649838901e-06,
1634
+ "loss": 0.493,
1635
+ "step": 267
1636
+ },
1637
+ {
1638
+ "epoch": 0.63,
1639
+ "learning_rate": 1.5999885023258099e-06,
1640
+ "loss": 0.4896,
1641
+ "step": 268
1642
+ },
1643
+ {
1644
+ "epoch": 0.63,
1645
+ "learning_rate": 1.5824844782223956e-06,
1646
+ "loss": 0.516,
1647
+ "step": 269
1648
+ },
1649
+ {
1650
+ "epoch": 0.63,
1651
+ "learning_rate": 1.5650322814157764e-06,
1652
+ "loss": 0.5059,
1653
+ "step": 270
1654
+ },
1655
+ {
1656
+ "epoch": 0.63,
1657
+ "learning_rate": 1.5476328977205396e-06,
1658
+ "loss": 0.5017,
1659
+ "step": 271
1660
+ },
1661
+ {
1662
+ "epoch": 0.64,
1663
+ "learning_rate": 1.5302873099680378e-06,
1664
+ "loss": 0.4947,
1665
+ "step": 272
1666
+ },
1667
+ {
1668
+ "epoch": 0.64,
1669
+ "learning_rate": 1.5129964979508792e-06,
1670
+ "loss": 0.4764,
1671
+ "step": 273
1672
+ },
1673
+ {
1674
+ "epoch": 0.64,
1675
+ "learning_rate": 1.495761438367577e-06,
1676
+ "loss": 0.5008,
1677
+ "step": 274
1678
+ },
1679
+ {
1680
+ "epoch": 0.64,
1681
+ "learning_rate": 1.47858310476738e-06,
1682
+ "loss": 0.5402,
1683
+ "step": 275
1684
+ },
1685
+ {
1686
+ "epoch": 0.64,
1687
+ "learning_rate": 1.4614624674952843e-06,
1688
+ "loss": 0.4882,
1689
+ "step": 276
1690
+ },
1691
+ {
1692
+ "epoch": 0.65,
1693
+ "learning_rate": 1.4444004936372166e-06,
1694
+ "loss": 0.4946,
1695
+ "step": 277
1696
+ },
1697
+ {
1698
+ "epoch": 0.65,
1699
+ "learning_rate": 1.4273981469654093e-06,
1700
+ "loss": 0.5305,
1701
+ "step": 278
1702
+ },
1703
+ {
1704
+ "epoch": 0.65,
1705
+ "learning_rate": 1.4104563878839623e-06,
1706
+ "loss": 0.4951,
1707
+ "step": 279
1708
+ },
1709
+ {
1710
+ "epoch": 0.65,
1711
+ "learning_rate": 1.3935761733745865e-06,
1712
+ "loss": 0.5188,
1713
+ "step": 280
1714
+ },
1715
+ {
1716
+ "epoch": 0.66,
1717
+ "learning_rate": 1.3767584569425562e-06,
1718
+ "loss": 0.4943,
1719
+ "step": 281
1720
+ },
1721
+ {
1722
+ "epoch": 0.66,
1723
+ "learning_rate": 1.360004188562841e-06,
1724
+ "loss": 0.485,
1725
+ "step": 282
1726
+ },
1727
+ {
1728
+ "epoch": 0.66,
1729
+ "learning_rate": 1.3433143146264494e-06,
1730
+ "loss": 0.5002,
1731
+ "step": 283
1732
+ },
1733
+ {
1734
+ "epoch": 0.66,
1735
+ "learning_rate": 1.3266897778869704e-06,
1736
+ "loss": 0.5005,
1737
+ "step": 284
1738
+ },
1739
+ {
1740
+ "epoch": 0.67,
1741
+ "learning_rate": 1.3101315174073162e-06,
1742
+ "loss": 0.513,
1743
+ "step": 285
1744
+ },
1745
+ {
1746
+ "epoch": 0.67,
1747
+ "learning_rate": 1.2936404685066852e-06,
1748
+ "loss": 0.5159,
1749
+ "step": 286
1750
+ },
1751
+ {
1752
+ "epoch": 0.67,
1753
+ "learning_rate": 1.2772175627077204e-06,
1754
+ "loss": 0.5532,
1755
+ "step": 287
1756
+ },
1757
+ {
1758
+ "epoch": 0.67,
1759
+ "learning_rate": 1.2608637276838987e-06,
1760
+ "loss": 0.4815,
1761
+ "step": 288
1762
+ },
1763
+ {
1764
+ "epoch": 0.67,
1765
+ "learning_rate": 1.244579887207126e-06,
1766
+ "loss": 0.4783,
1767
+ "step": 289
1768
+ },
1769
+ {
1770
+ "epoch": 0.68,
1771
+ "learning_rate": 1.2283669610955543e-06,
1772
+ "loss": 0.4875,
1773
+ "step": 290
1774
+ },
1775
+ {
1776
+ "epoch": 0.68,
1777
+ "learning_rate": 1.2122258651616305e-06,
1778
+ "loss": 0.5021,
1779
+ "step": 291
1780
+ },
1781
+ {
1782
+ "epoch": 0.68,
1783
+ "learning_rate": 1.1961575111603588e-06,
1784
+ "loss": 0.4948,
1785
+ "step": 292
1786
+ },
1787
+ {
1788
+ "epoch": 0.68,
1789
+ "learning_rate": 1.1801628067378033e-06,
1790
+ "loss": 0.4622,
1791
+ "step": 293
1792
+ },
1793
+ {
1794
+ "epoch": 0.69,
1795
+ "learning_rate": 1.1642426553798175e-06,
1796
+ "loss": 0.5352,
1797
+ "step": 294
1798
+ },
1799
+ {
1800
+ "epoch": 0.69,
1801
+ "learning_rate": 1.148397956361007e-06,
1802
+ "loss": 0.6051,
1803
+ "step": 295
1804
+ },
1805
+ {
1806
+ "epoch": 0.69,
1807
+ "learning_rate": 1.1326296046939334e-06,
1808
+ "loss": 0.506,
1809
+ "step": 296
1810
+ },
1811
+ {
1812
+ "epoch": 0.69,
1813
+ "learning_rate": 1.1169384910785613e-06,
1814
+ "loss": 0.5109,
1815
+ "step": 297
1816
+ },
1817
+ {
1818
+ "epoch": 0.7,
1819
+ "learning_rate": 1.1013255018519426e-06,
1820
+ "loss": 0.5157,
1821
+ "step": 298
1822
+ },
1823
+ {
1824
+ "epoch": 0.7,
1825
+ "learning_rate": 1.0857915189381512e-06,
1826
+ "loss": 0.5131,
1827
+ "step": 299
1828
+ },
1829
+ {
1830
+ "epoch": 0.7,
1831
+ "learning_rate": 1.0703374197984654e-06,
1832
+ "loss": 0.5295,
1833
+ "step": 300
1834
+ },
1835
+ {
1836
+ "epoch": 0.7,
1837
+ "learning_rate": 1.054964077381803e-06,
1838
+ "loss": 0.4669,
1839
+ "step": 301
1840
+ },
1841
+ {
1842
+ "epoch": 0.71,
1843
+ "learning_rate": 1.0396723600754144e-06,
1844
+ "loss": 0.4831,
1845
+ "step": 302
1846
+ },
1847
+ {
1848
+ "epoch": 0.71,
1849
+ "learning_rate": 1.0244631316558268e-06,
1850
+ "loss": 0.5104,
1851
+ "step": 303
1852
+ },
1853
+ {
1854
+ "epoch": 0.71,
1855
+ "learning_rate": 1.009337251240055e-06,
1856
+ "loss": 0.4822,
1857
+ "step": 304
1858
+ },
1859
+ {
1860
+ "epoch": 0.71,
1861
+ "learning_rate": 9.942955732370706e-07,
1862
+ "loss": 0.4747,
1863
+ "step": 305
1864
+ },
1865
+ {
1866
+ "epoch": 0.71,
1867
+ "learning_rate": 9.793389472995393e-07,
1868
+ "loss": 0.4954,
1869
+ "step": 306
1870
+ },
1871
+ {
1872
+ "epoch": 0.72,
1873
+ "learning_rate": 9.644682182758305e-07,
1874
+ "loss": 0.4992,
1875
+ "step": 307
1876
+ },
1877
+ {
1878
+ "epoch": 0.72,
1879
+ "learning_rate": 9.496842261622921e-07,
1880
+ "loss": 0.4863,
1881
+ "step": 308
1882
+ },
1883
+ {
1884
+ "epoch": 0.72,
1885
+ "learning_rate": 9.349878060557998e-07,
1886
+ "loss": 0.4711,
1887
+ "step": 309
1888
+ },
1889
+ {
1890
+ "epoch": 0.72,
1891
+ "learning_rate": 9.203797881065907e-07,
1892
+ "loss": 0.54,
1893
+ "step": 310
1894
+ },
1895
+ {
1896
+ "epoch": 0.73,
1897
+ "learning_rate": 9.058609974713655e-07,
1898
+ "loss": 0.5112,
1899
+ "step": 311
1900
+ },
1901
+ {
1902
+ "epoch": 0.73,
1903
+ "learning_rate": 8.914322542666822e-07,
1904
+ "loss": 0.4862,
1905
+ "step": 312
1906
+ },
1907
+ {
1908
+ "epoch": 0.73,
1909
+ "learning_rate": 8.770943735226303e-07,
1910
+ "loss": 0.4967,
1911
+ "step": 313
1912
+ },
1913
+ {
1914
+ "epoch": 0.73,
1915
+ "learning_rate": 8.628481651367876e-07,
1916
+ "loss": 0.4836,
1917
+ "step": 314
1918
+ },
1919
+ {
1920
+ "epoch": 0.74,
1921
+ "learning_rate": 8.486944338284797e-07,
1922
+ "loss": 0.4816,
1923
+ "step": 315
1924
+ },
1925
+ {
1926
+ "epoch": 0.74,
1927
+ "learning_rate": 8.346339790933167e-07,
1928
+ "loss": 0.4775,
1929
+ "step": 316
1930
+ },
1931
+ {
1932
+ "epoch": 0.74,
1933
+ "learning_rate": 8.206675951580382e-07,
1934
+ "loss": 0.4966,
1935
+ "step": 317
1936
+ },
1937
+ {
1938
+ "epoch": 0.74,
1939
+ "learning_rate": 8.067960709356479e-07,
1940
+ "loss": 0.4697,
1941
+ "step": 318
1942
+ },
1943
+ {
1944
+ "epoch": 0.74,
1945
+ "learning_rate": 7.930201899808476e-07,
1946
+ "loss": 0.4939,
1947
+ "step": 319
1948
+ },
1949
+ {
1950
+ "epoch": 0.75,
1951
+ "learning_rate": 7.793407304457836e-07,
1952
+ "loss": 0.4889,
1953
+ "step": 320
1954
+ },
1955
+ {
1956
+ "epoch": 0.75,
1957
+ "learning_rate": 7.657584650360847e-07,
1958
+ "loss": 0.483,
1959
+ "step": 321
1960
+ },
1961
+ {
1962
+ "epoch": 0.75,
1963
+ "eval_loss": 0.5117549896240234,
1964
+ "eval_runtime": 974.2092,
1965
+ "eval_samples_per_second": 1.156,
1966
+ "eval_steps_per_second": 0.145,
1967
+ "step": 321
1968
+ },
1969
+ {
1970
+ "epoch": 0.75,
1971
+ "learning_rate": 7.522741609672194e-07,
1972
+ "loss": 0.4917,
1973
+ "step": 322
1974
+ },
1975
+ {
1976
+ "epoch": 0.75,
1977
+ "learning_rate": 7.388885799211573e-07,
1978
+ "loss": 0.5108,
1979
+ "step": 323
1980
+ },
1981
+ {
1982
+ "epoch": 0.76,
1983
+ "learning_rate": 7.256024780033418e-07,
1984
+ "loss": 0.4503,
1985
+ "step": 324
1986
+ },
1987
+ {
1988
+ "epoch": 0.76,
1989
+ "learning_rate": 7.124166056999854e-07,
1990
+ "loss": 0.4906,
1991
+ "step": 325
1992
+ },
1993
+ {
1994
+ "epoch": 0.76,
1995
+ "learning_rate": 6.993317078356709e-07,
1996
+ "loss": 0.492,
1997
+ "step": 326
1998
+ },
1999
+ {
2000
+ "epoch": 0.76,
2001
+ "learning_rate": 6.863485235312853e-07,
2002
+ "loss": 0.4756,
2003
+ "step": 327
2004
+ },
2005
+ {
2006
+ "epoch": 0.77,
2007
+ "learning_rate": 6.734677861622652e-07,
2008
+ "loss": 0.5299,
2009
+ "step": 328
2010
+ },
2011
+ {
2012
+ "epoch": 0.77,
2013
+ "learning_rate": 6.60690223317171e-07,
2014
+ "loss": 0.4963,
2015
+ "step": 329
2016
+ },
2017
+ {
2018
+ "epoch": 0.77,
2019
+ "learning_rate": 6.480165567565913e-07,
2020
+ "loss": 0.5079,
2021
+ "step": 330
2022
+ },
2023
+ {
2024
+ "epoch": 0.77,
2025
+ "learning_rate": 6.354475023723685e-07,
2026
+ "loss": 0.4878,
2027
+ "step": 331
2028
+ },
2029
+ {
2030
+ "epoch": 0.78,
2031
+ "learning_rate": 6.229837701471645e-07,
2032
+ "loss": 0.4631,
2033
+ "step": 332
2034
+ },
2035
+ {
2036
+ "epoch": 0.78,
2037
+ "learning_rate": 6.106260641143547e-07,
2038
+ "loss": 0.4811,
2039
+ "step": 333
2040
+ },
2041
+ {
2042
+ "epoch": 0.78,
2043
+ "learning_rate": 5.983750823182574e-07,
2044
+ "loss": 0.4797,
2045
+ "step": 334
2046
+ },
2047
+ {
2048
+ "epoch": 0.78,
2049
+ "learning_rate": 5.86231516774709e-07,
2050
+ "loss": 0.5034,
2051
+ "step": 335
2052
+ },
2053
+ {
2054
+ "epoch": 0.78,
2055
+ "learning_rate": 5.741960534319677e-07,
2056
+ "loss": 0.5047,
2057
+ "step": 336
2058
+ },
2059
+ {
2060
+ "epoch": 0.79,
2061
+ "learning_rate": 5.622693721319728e-07,
2062
+ "loss": 0.508,
2063
+ "step": 337
2064
+ },
2065
+ {
2066
+ "epoch": 0.79,
2067
+ "learning_rate": 5.504521465719392e-07,
2068
+ "loss": 0.4828,
2069
+ "step": 338
2070
+ },
2071
+ {
2072
+ "epoch": 0.79,
2073
+ "learning_rate": 5.387450442663026e-07,
2074
+ "loss": 0.4878,
2075
+ "step": 339
2076
+ },
2077
+ {
2078
+ "epoch": 0.79,
2079
+ "learning_rate": 5.271487265090163e-07,
2080
+ "loss": 0.4994,
2081
+ "step": 340
2082
+ },
2083
+ {
2084
+ "epoch": 0.8,
2085
+ "learning_rate": 5.156638483361933e-07,
2086
+ "loss": 0.5069,
2087
+ "step": 341
2088
+ },
2089
+ {
2090
+ "epoch": 0.8,
2091
+ "learning_rate": 5.0429105848911e-07,
2092
+ "loss": 0.4739,
2093
+ "step": 342
2094
+ },
2095
+ {
2096
+ "epoch": 0.8,
2097
+ "learning_rate": 4.930309993775578e-07,
2098
+ "loss": 0.4773,
2099
+ "step": 343
2100
+ },
2101
+ {
2102
+ "epoch": 0.8,
2103
+ "learning_rate": 4.818843070435561e-07,
2104
+ "loss": 0.4791,
2105
+ "step": 344
2106
+ },
2107
+ {
2108
+ "epoch": 0.81,
2109
+ "learning_rate": 4.708516111254238e-07,
2110
+ "loss": 0.4662,
2111
+ "step": 345
2112
+ },
2113
+ {
2114
+ "epoch": 0.81,
2115
+ "learning_rate": 4.5993353482221697e-07,
2116
+ "loss": 0.4834,
2117
+ "step": 346
2118
+ },
2119
+ {
2120
+ "epoch": 0.81,
2121
+ "learning_rate": 4.4913069485852197e-07,
2122
+ "loss": 0.5012,
2123
+ "step": 347
2124
+ },
2125
+ {
2126
+ "epoch": 0.81,
2127
+ "learning_rate": 4.3844370144962153e-07,
2128
+ "loss": 0.492,
2129
+ "step": 348
2130
+ },
2131
+ {
2132
+ "epoch": 0.81,
2133
+ "learning_rate": 4.2787315826702396e-07,
2134
+ "loss": 0.468,
2135
+ "step": 349
2136
+ },
2137
+ {
2138
+ "epoch": 0.82,
2139
+ "learning_rate": 4.1741966240436446e-07,
2140
+ "loss": 0.485,
2141
+ "step": 350
2142
+ },
2143
+ {
2144
+ "epoch": 0.82,
2145
+ "learning_rate": 4.070838043436787e-07,
2146
+ "loss": 0.5009,
2147
+ "step": 351
2148
+ },
2149
+ {
2150
+ "epoch": 0.82,
2151
+ "learning_rate": 3.9686616792204677e-07,
2152
+ "loss": 0.4994,
2153
+ "step": 352
2154
+ },
2155
+ {
2156
+ "epoch": 0.82,
2157
+ "learning_rate": 3.867673302986161e-07,
2158
+ "loss": 0.4665,
2159
+ "step": 353
2160
+ },
2161
+ {
2162
+ "epoch": 0.83,
2163
+ "learning_rate": 3.7678786192199695e-07,
2164
+ "loss": 0.482,
2165
+ "step": 354
2166
+ },
2167
+ {
2168
+ "epoch": 0.83,
2169
+ "learning_rate": 3.6692832649804085e-07,
2170
+ "loss": 0.4914,
2171
+ "step": 355
2172
+ },
2173
+ {
2174
+ "epoch": 0.83,
2175
+ "learning_rate": 3.571892809580013e-07,
2176
+ "loss": 0.5156,
2177
+ "step": 356
2178
+ },
2179
+ {
2180
+ "epoch": 0.83,
2181
+ "learning_rate": 3.475712754270716e-07,
2182
+ "loss": 0.5109,
2183
+ "step": 357
2184
+ },
2185
+ {
2186
+ "epoch": 0.84,
2187
+ "learning_rate": 3.3807485319331037e-07,
2188
+ "loss": 0.4865,
2189
+ "step": 358
2190
+ },
2191
+ {
2192
+ "epoch": 0.84,
2193
+ "learning_rate": 3.2870055067695557e-07,
2194
+ "loss": 0.4672,
2195
+ "step": 359
2196
+ },
2197
+ {
2198
+ "epoch": 0.84,
2199
+ "learning_rate": 3.194488974001203e-07,
2200
+ "loss": 0.5019,
2201
+ "step": 360
2202
+ },
2203
+ {
2204
+ "epoch": 0.84,
2205
+ "learning_rate": 3.1032041595688514e-07,
2206
+ "loss": 0.4891,
2207
+ "step": 361
2208
+ },
2209
+ {
2210
+ "epoch": 0.85,
2211
+ "learning_rate": 3.0131562198377763e-07,
2212
+ "loss": 0.4944,
2213
+ "step": 362
2214
+ },
2215
+ {
2216
+ "epoch": 0.85,
2217
+ "learning_rate": 2.9243502413064365e-07,
2218
+ "loss": 0.4971,
2219
+ "step": 363
2220
+ },
2221
+ {
2222
+ "epoch": 0.85,
2223
+ "learning_rate": 2.8367912403191976e-07,
2224
+ "loss": 0.4814,
2225
+ "step": 364
2226
+ },
2227
+ {
2228
+ "epoch": 0.85,
2229
+ "learning_rate": 2.7504841627829293e-07,
2230
+ "loss": 0.4853,
2231
+ "step": 365
2232
+ },
2233
+ {
2234
+ "epoch": 0.85,
2235
+ "learning_rate": 2.6654338838876664e-07,
2236
+ "loss": 0.4849,
2237
+ "step": 366
2238
+ },
2239
+ {
2240
+ "epoch": 0.86,
2241
+ "learning_rate": 2.581645207831204e-07,
2242
+ "loss": 0.4495,
2243
+ "step": 367
2244
+ },
2245
+ {
2246
+ "epoch": 0.86,
2247
+ "learning_rate": 2.4991228675477293e-07,
2248
+ "loss": 0.5105,
2249
+ "step": 368
2250
+ },
2251
+ {
2252
+ "epoch": 0.86,
2253
+ "learning_rate": 2.4178715244404796e-07,
2254
+ "loss": 0.4874,
2255
+ "step": 369
2256
+ },
2257
+ {
2258
+ "epoch": 0.86,
2259
+ "learning_rate": 2.3378957681184283e-07,
2260
+ "loss": 0.4764,
2261
+ "step": 370
2262
+ },
2263
+ {
2264
+ "epoch": 0.87,
2265
+ "learning_rate": 2.2592001161370392e-07,
2266
+ "loss": 0.516,
2267
+ "step": 371
2268
+ },
2269
+ {
2270
+ "epoch": 0.87,
2271
+ "learning_rate": 2.1817890137430936e-07,
2272
+ "loss": 0.5093,
2273
+ "step": 372
2274
+ },
2275
+ {
2276
+ "epoch": 0.87,
2277
+ "learning_rate": 2.1056668336235624e-07,
2278
+ "loss": 0.484,
2279
+ "step": 373
2280
+ },
2281
+ {
2282
+ "epoch": 0.87,
2283
+ "learning_rate": 2.0308378756586562e-07,
2284
+ "loss": 0.4886,
2285
+ "step": 374
2286
+ },
2287
+ {
2288
+ "epoch": 0.88,
2289
+ "learning_rate": 1.9573063666788878e-07,
2290
+ "loss": 0.5288,
2291
+ "step": 375
2292
+ },
2293
+ {
2294
+ "epoch": 0.88,
2295
+ "learning_rate": 1.8850764602263428e-07,
2296
+ "loss": 0.5072,
2297
+ "step": 376
2298
+ },
2299
+ {
2300
+ "epoch": 0.88,
2301
+ "learning_rate": 1.8141522363200797e-07,
2302
+ "loss": 0.4875,
2303
+ "step": 377
2304
+ },
2305
+ {
2306
+ "epoch": 0.88,
2307
+ "learning_rate": 1.7445377012256127e-07,
2308
+ "loss": 0.4842,
2309
+ "step": 378
2310
+ },
2311
+ {
2312
+ "epoch": 0.88,
2313
+ "learning_rate": 1.676236787228652e-07,
2314
+ "loss": 0.4777,
2315
+ "step": 379
2316
+ },
2317
+ {
2318
+ "epoch": 0.89,
2319
+ "learning_rate": 1.6092533524129623e-07,
2320
+ "loss": 0.4904,
2321
+ "step": 380
2322
+ },
2323
+ {
2324
+ "epoch": 0.89,
2325
+ "learning_rate": 1.543591180442436e-07,
2326
+ "loss": 0.4872,
2327
+ "step": 381
2328
+ },
2329
+ {
2330
+ "epoch": 0.89,
2331
+ "learning_rate": 1.4792539803473921e-07,
2332
+ "loss": 0.4778,
2333
+ "step": 382
2334
+ },
2335
+ {
2336
+ "epoch": 0.89,
2337
+ "learning_rate": 1.4162453863150183e-07,
2338
+ "loss": 0.4944,
2339
+ "step": 383
2340
+ },
2341
+ {
2342
+ "epoch": 0.9,
2343
+ "learning_rate": 1.3545689574841341e-07,
2344
+ "loss": 0.4711,
2345
+ "step": 384
2346
+ },
2347
+ {
2348
+ "epoch": 0.9,
2349
+ "learning_rate": 1.2942281777441168e-07,
2350
+ "loss": 0.5078,
2351
+ "step": 385
2352
+ },
2353
+ {
2354
+ "epoch": 0.9,
2355
+ "learning_rate": 1.2352264555381134e-07,
2356
+ "loss": 0.4752,
2357
+ "step": 386
2358
+ },
2359
+ {
2360
+ "epoch": 0.9,
2361
+ "learning_rate": 1.1775671236705366e-07,
2362
+ "loss": 0.4983,
2363
+ "step": 387
2364
+ },
2365
+ {
2366
+ "epoch": 0.91,
2367
+ "learning_rate": 1.121253439118769e-07,
2368
+ "loss": 0.4865,
2369
+ "step": 388
2370
+ },
2371
+ {
2372
+ "epoch": 0.91,
2373
+ "learning_rate": 1.0662885828492037e-07,
2374
+ "loss": 0.4872,
2375
+ "step": 389
2376
+ },
2377
+ {
2378
+ "epoch": 0.91,
2379
+ "learning_rate": 1.0126756596375687e-07,
2380
+ "loss": 0.4642,
2381
+ "step": 390
2382
+ },
2383
+ {
2384
+ "epoch": 0.91,
2385
+ "learning_rate": 9.604176978935342e-08,
2386
+ "loss": 0.4933,
2387
+ "step": 391
2388
+ },
2389
+ {
2390
+ "epoch": 0.92,
2391
+ "learning_rate": 9.095176494896662e-08,
2392
+ "loss": 0.5095,
2393
+ "step": 392
2394
+ },
2395
+ {
2396
+ "epoch": 0.92,
2397
+ "learning_rate": 8.599783895946762e-08,
2398
+ "loss": 0.5184,
2399
+ "step": 393
2400
+ },
2401
+ {
2402
+ "epoch": 0.92,
2403
+ "learning_rate": 8.118027165109926e-08,
2404
+ "loss": 0.5139,
2405
+ "step": 394
2406
+ },
2407
+ {
2408
+ "epoch": 0.92,
2409
+ "learning_rate": 7.649933515167407e-08,
2410
+ "loss": 0.4938,
2411
+ "step": 395
2412
+ },
2413
+ {
2414
+ "epoch": 0.92,
2415
+ "learning_rate": 7.195529387119815e-08,
2416
+ "loss": 0.4902,
2417
+ "step": 396
2418
+ },
2419
+ {
2420
+ "epoch": 0.93,
2421
+ "learning_rate": 6.75484044869379e-08,
2422
+ "loss": 0.4729,
2423
+ "step": 397
2424
+ },
2425
+ {
2426
+ "epoch": 0.93,
2427
+ "learning_rate": 6.327891592892126e-08,
2428
+ "loss": 0.4762,
2429
+ "step": 398
2430
+ },
2431
+ {
2432
+ "epoch": 0.93,
2433
+ "learning_rate": 5.914706936587494e-08,
2434
+ "loss": 0.4657,
2435
+ "step": 399
2436
+ },
2437
+ {
2438
+ "epoch": 0.93,
2439
+ "learning_rate": 5.515309819160402e-08,
2440
+ "loss": 0.4822,
2441
+ "step": 400
2442
+ },
2443
+ {
2444
+ "epoch": 0.94,
2445
+ "learning_rate": 5.129722801180542e-08,
2446
+ "loss": 0.4863,
2447
+ "step": 401
2448
+ },
2449
+ {
2450
+ "epoch": 0.94,
2451
+ "learning_rate": 4.75796766313269e-08,
2452
+ "loss": 0.4953,
2453
+ "step": 402
2454
+ },
2455
+ {
2456
+ "epoch": 0.94,
2457
+ "learning_rate": 4.4000654041862764e-08,
2458
+ "loss": 0.4952,
2459
+ "step": 403
2460
+ },
2461
+ {
2462
+ "epoch": 0.94,
2463
+ "learning_rate": 4.05603624100917e-08,
2464
+ "loss": 0.4605,
2465
+ "step": 404
2466
+ },
2467
+ {
2468
+ "epoch": 0.95,
2469
+ "learning_rate": 3.72589960662581e-08,
2470
+ "loss": 0.4967,
2471
+ "step": 405
2472
+ },
2473
+ {
2474
+ "epoch": 0.95,
2475
+ "learning_rate": 3.4096741493194196e-08,
2476
+ "loss": 0.4784,
2477
+ "step": 406
2478
+ },
2479
+ {
2480
+ "epoch": 0.95,
2481
+ "learning_rate": 3.107377731578709e-08,
2482
+ "loss": 0.5004,
2483
+ "step": 407
2484
+ },
2485
+ {
2486
+ "epoch": 0.95,
2487
+ "learning_rate": 2.819027429088822e-08,
2488
+ "loss": 0.4836,
2489
+ "step": 408
2490
+ },
2491
+ {
2492
+ "epoch": 0.96,
2493
+ "learning_rate": 2.544639529766829e-08,
2494
+ "loss": 0.475,
2495
+ "step": 409
2496
+ },
2497
+ {
2498
+ "epoch": 0.96,
2499
+ "learning_rate": 2.284229532841603e-08,
2500
+ "loss": 0.4853,
2501
+ "step": 410
2502
+ },
2503
+ {
2504
+ "epoch": 0.96,
2505
+ "learning_rate": 2.0378121479783798e-08,
2506
+ "loss": 0.5007,
2507
+ "step": 411
2508
+ },
2509
+ {
2510
+ "epoch": 0.96,
2511
+ "learning_rate": 1.8054012944479225e-08,
2512
+ "loss": 0.4842,
2513
+ "step": 412
2514
+ },
2515
+ {
2516
+ "epoch": 0.96,
2517
+ "learning_rate": 1.5870101003402083e-08,
2518
+ "loss": 0.4887,
2519
+ "step": 413
2520
+ },
2521
+ {
2522
+ "epoch": 0.97,
2523
+ "learning_rate": 1.382650901822713e-08,
2524
+ "loss": 0.492,
2525
+ "step": 414
2526
+ },
2527
+ {
2528
+ "epoch": 0.97,
2529
+ "learning_rate": 1.1923352424439149e-08,
2530
+ "loss": 0.48,
2531
+ "step": 415
2532
+ },
2533
+ {
2534
+ "epoch": 0.97,
2535
+ "learning_rate": 1.0160738724809549e-08,
2536
+ "loss": 0.5003,
2537
+ "step": 416
2538
+ },
2539
+ {
2540
+ "epoch": 0.97,
2541
+ "learning_rate": 8.538767483325384e-09,
2542
+ "loss": 0.5103,
2543
+ "step": 417
2544
+ },
2545
+ {
2546
+ "epoch": 0.98,
2547
+ "learning_rate": 7.05753031956441e-09,
2548
+ "loss": 0.5304,
2549
+ "step": 418
2550
+ },
2551
+ {
2552
+ "epoch": 0.98,
2553
+ "learning_rate": 5.717110903520617e-09,
2554
+ "loss": 0.5166,
2555
+ "step": 419
2556
+ },
2557
+ {
2558
+ "epoch": 0.98,
2559
+ "learning_rate": 4.517584950877451e-09,
2560
+ "loss": 0.4779,
2561
+ "step": 420
2562
+ },
2563
+ {
2564
+ "epoch": 0.98,
2565
+ "learning_rate": 3.4590202187315124e-09,
2566
+ "loss": 0.5138,
2567
+ "step": 421
2568
+ },
2569
+ {
2570
+ "epoch": 0.99,
2571
+ "learning_rate": 2.5414765017642285e-09,
2572
+ "loss": 0.5101,
2573
+ "step": 422
2574
+ },
2575
+ {
2576
+ "epoch": 0.99,
2577
+ "learning_rate": 1.765005628865113e-09,
2578
+ "loss": 0.4836,
2579
+ "step": 423
2580
+ },
2581
+ {
2582
+ "epoch": 0.99,
2583
+ "learning_rate": 1.1296514602038289e-09,
2584
+ "loss": 0.5055,
2585
+ "step": 424
2586
+ },
2587
+ {
2588
+ "epoch": 0.99,
2589
+ "learning_rate": 6.354498847521706e-10,
2590
+ "loss": 0.4525,
2591
+ "step": 425
2592
+ },
2593
+ {
2594
+ "epoch": 0.99,
2595
+ "learning_rate": 2.8242881825846225e-10,
2596
+ "loss": 0.4781,
2597
+ "step": 426
2598
+ },
2599
+ {
2600
+ "epoch": 1.0,
2601
+ "learning_rate": 7.060820166826521e-11,
2602
+ "loss": 0.5002,
2603
+ "step": 427
2604
+ },
2605
+ {
2606
+ "epoch": 1.0,
2607
+ "learning_rate": 0.0,
2608
+ "loss": 0.4674,
2609
+ "step": 428
2610
+ },
2611
+ {
2612
+ "epoch": 1.0,
2613
+ "eval_loss": 0.5059249997138977,
2614
+ "eval_runtime": 973.6949,
2615
+ "eval_samples_per_second": 1.156,
2616
+ "eval_steps_per_second": 0.145,
2617
+ "step": 428
2618
+ }
2619
+ ],
2620
+ "logging_steps": 1,
2621
+ "max_steps": 428,
2622
+ "num_input_tokens_seen": 0,
2623
+ "num_train_epochs": 1,
2624
+ "save_steps": 214,
2625
+ "total_flos": 179124295434240.0,
2626
+ "train_batch_size": 1,
2627
+ "trial_name": null,
2628
+ "trial_params": null
2629
+ }
zero_to_fp32.py ADDED
@@ -0,0 +1,592 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts consolidated fp32 weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example: python zero_to_fp32.py . pytorch_model.bin
14
+
15
+ import argparse
16
+ import torch
17
+ import glob
18
+ import math
19
+ import os
20
+ import re
21
+ from collections import OrderedDict
22
+ from dataclasses import dataclass
23
+
24
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
25
+ # DeepSpeed data structures it has to be available in the current python environment.
26
+ from deepspeed.utils import logger
27
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
28
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
29
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
30
+
31
+
32
@dataclass
class zero_model_state:
    """What ``parse_model_states`` keeps from one ``*_model_states.pt`` file."""
    # buffer name -> tensor converted with ``.float()``
    buffers: dict
    # list of ``name -> shape`` mappings, one mapping per param group
    param_shapes: list
    # list of two-item ``[key, value]`` pairs; ``key`` is filled from ``value`` after reconstruction
    shared_params: list
    # DeepSpeed version recorded in the checkpoint, or None when the file does not carry it
    ds_version: object
    # ``name -> shape`` mapping of frozen params, or None when the checkpoint has none
    frozen_param_shapes: dict
    # ``name -> tensor`` mapping with this rank's frozen param data, or None
    frozen_param_fragments: dict
40
+
41
+
42
# Verbosity switch consulted by the helpers in this file; the command-line entry point
# overwrites it with the value of --debug.
debug = 0

# load to cpu
device = torch.device('cpu')
46
+
47
+
48
def atoi(text):
    """
    Return ``int(text)`` when ``text`` consists solely of decimal digits, otherwise ``text`` unchanged.

    Args:
        - ``text``: string chunk to convert

    Returns:
        - an ``int`` for decimal-digit chunks, the original string for anything else (including ``""``)
    """
    # ``str.isdigit`` is also true for characters such as superscripts ('²') which ``int`` rejects
    # with a ValueError; ``str.isdecimal`` only accepts what ``int`` is able to parse.
    return int(text) if text.isdecimal() else text
50
+
51
+
52
def natural_keys(text):
    '''
    Sort key that orders strings the way a human would, e.g. ``alist.sort(key=natural_keys)``.

    The text is cut at every run of digits and each chunk is passed through ``atoi``.
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    chunks = re.split(r'(\d+)', text)
    return list(map(atoi, chunks))
59
+
60
+
61
def get_model_state_file(checkpoint_dir, zero_stage):
    """
    Return the path of the single model-states file expected for the given zero stage.

    Args:
        - ``checkpoint_dir``: directory holding the checkpoint files
        - ``zero_stage``: zero stage the checkpoint was saved with

    Returns:
        - path of the model-states file inside ``checkpoint_dir``

    Raises:
        - ``FileNotFoundError``: if ``checkpoint_dir`` or the expected file doesn't exist
        - ``ValueError``: if ``zero_stage`` is neither ``<= 2`` nor ``3``
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # without this branch ``file`` would be unbound and the check below would die with an
        # UnboundLocalError
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
75
+
76
+
77
def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """
    Collect the files in ``checkpoint_dir`` that match ``glob_pattern``, sorted with ``natural_keys``.

    Raises:
        - ``FileNotFoundError``: if nothing matches
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    matches = glob.glob(os.path.join(checkpoint_dir, glob_pattern))
    matches.sort(key=natural_keys)

    if not matches:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return matches
85
+
86
+
87
def get_optim_files(checkpoint_dir):
    """Return the ``*_optim_states.pt`` files of ``checkpoint_dir`` via ``get_checkpoint_files``."""
    optim_pattern = "*_optim_states.pt"
    return get_checkpoint_files(checkpoint_dir, optim_pattern)
89
+
90
+
91
def get_model_state_files(checkpoint_dir):
    """Return the ``*_model_states.pt`` files of ``checkpoint_dir`` via ``get_checkpoint_files``."""
    model_pattern = "*_model_states.pt"
    return get_checkpoint_files(checkpoint_dir, model_pattern)
93
+
94
+
95
def parse_model_states(files):
    """
    Load every model-states file and keep only what the weight reconstruction needs.

    Args:
        - ``files``: paths of the model-states files to read

    Returns:
        - list with one ``zero_model_state`` per file, in the order of ``files``

    Raises:
        - ``ValueError``: if a file has no ``BUFFER_NAMES`` entry
    """
    zero_model_states = []
    for file in files:
        # NOTE: torch.load unpickles the file, so only point this script at checkpoints you trust
        state_dict = torch.load(file, map_location=device)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # frozen parameters are optional - the entry is absent when the checkpoint has none
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if debug and frozen_param_shapes is not None:
            print(f"Found frozen_param_shapes: {frozen_param_shapes}")

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states
139
+
140
+
141
def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load the optimizer-states files and pull out the flat fp32 weight partitions.

    Args:
        - ``files``: paths of the ``*_optim_states.pt`` files, one per rank
        - ``ds_checkpoint_dir``: checkpoint folder, only used in error messages

    Returns:
        - tuple ``(zero_stage, world_size, fp32_flat_groups)``. For stage <= 2 each entry of
          ``fp32_flat_groups`` is the per-rank list of group partitions; for stage 3 each entry is
          that rank's groups concatenated into a single tensor.

    Raises:
        - ``FileNotFoundError``: if ``files`` is empty
        - ``ValueError``: if the first file isn't a zero checkpoint, the number of files doesn't
          match the recorded partition count, or the zero stage is unknown
    """
    total_files = len(files)
    state_dicts = []
    for f in files:
        # NOTE: torch.load unpickles the file, so only point this script at checkpoints you trust
        state_dict = torch.load(f, map_location=device)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(state_dict)

    if not state_dicts:
        # indexing state_dicts[0] below would otherwise fail with an uninformative IndexError
        raise FileNotFoundError(f"can't find '*_optim_states.pt' files in directory '{ds_checkpoint_dir}'")

    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim_state[ZERO_STAGE]
    world_size = first_optim_state[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if isinstance(world_size, list):
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in state_dicts]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor
        fp32_flat_groups = [torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in state_dicts]
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    return zero_stage, world_size, fp32_flat_groups
192
+
193
+
194
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
    """
    Rebuild the fp32 state_dict stored in a ds checkpoint folder.

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)

    Returns:
        - the reconstructed state_dict, produced by the zero2 or zero3 merger matching the detected stage
    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
                                                                  ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # pick the merger for the detected stage
    if zero_stage <= 2:
        merge = _get_fp32_state_dict_from_zero2_checkpoint
    elif zero_stage == 3:
        merge = _get_fp32_state_dict_from_zero3_checkpoint
    else:
        return None
    return merge(world_size, fp32_flat_groups, zero_model_states)
217
+
218
+
219
def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """
    Copy the frozen parameters recorded by the first model state into ``state_dict``.

    Args:
        - ``state_dict``: mapping updated in place with ``name -> fragment`` entries
        - ``zero_model_states``: parsed model states; only the first entry is read

    Does nothing when the first model state has no frozen param shapes.
    """
    rank0_state = zero_model_states[0]
    shapes_by_name = rank0_state.frozen_param_shapes
    if shapes_by_name is None or len(shapes_by_name) == 0:
        return

    fragments_by_name = rank0_state.frozen_param_fragments

    if debug:
        num_elem = sum(s.numel() for s in shapes_by_name.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    wanted_params = len(shapes_by_name)
    wanted_numel = sum(s.numel() for s in shapes_by_name.values())
    avail_numel = sum(p.numel() for p in fragments_by_name.values())
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in shapes_by_name.items():
        unpartitioned_numel = shape.numel()
        total_params += 1
        total_numel += unpartitioned_numel

        # the fragment is stored as is - no concatenation across ranks happens here
        state_dict[name] = fragments_by_name[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
249
+
250
+
251
+ def _has_callable(obj, fn):
252
+ attr = getattr(obj, fn, None)
253
+ return callable(attr)
254
+
255
+
256
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Rebuild every trainable parameter of a zero stage <= 2 checkpoint and store it in ``state_dict``.

    Args:
        - ``state_dict``: mapping updated in place with ``name -> tensor`` entries
        - ``world_size``: number of ranks; also drives the alignment used by the sanity check
        - ``fp32_flat_groups``: per-rank lists holding one flat fp32 partition per param group
        - ``zero_model_states``: parsed model states; only the first entry's ``param_shapes`` is used

    Raises:
        - ``ValueError``: if, for a param group, the numels consumed and the numels available
          disagree after both are rounded up to a multiple of ``2 * world_size``
    """
    param_shapes = zero_model_states[0].param_shapes

    def shape_numel(shape):
        # a shape either exposes numel() or is a plain sequence of dimensions
        return shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        return align_to * math.ceil(x / align_to)

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        # stitch the i-th group's partitions of all ranks back into one flat vector
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        full_single_fp32_vector = torch.cat(merged_partitions, 0)
        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
    avail_numel = sum(vector.numel() for vector in merged_single_partition_of_fp32_groups)

    if debug:
        wanted_params = sum(len(shapes) for shapes in param_shapes)
        # same numel rule as the main loop below, so debug mode also copes with plain-sequence shapes
        wanted_numel = sum(shape_numel(shape) for shapes in param_shapes for shape in shapes.values())
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():

            unpartitioned_numel = shape_numel(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
327
+
328
+
329
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
    """
    Assemble the full state_dict of a zero stage <= 2 checkpoint.

    Entries are added in this order: buffers, frozen params, trainable params, shared params.

    Returns:
        - ``OrderedDict`` mapping names to tensors
    """
    rank0_state = zero_model_states[0]
    state_dict = OrderedDict()

    # buffers
    state_dict.update(rank0_state.buffers)
    if debug:
        print(f"added {len(rank0_state.buffers)} buffers")

    _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for alias, source in rank0_state.shared_params:
        if source in state_dict:
            state_dict[alias] = state_dict[source]

    return state_dict
348
+
349
+
350
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """
    Work out how a parameter of ``unpartitioned_numel`` elements is split across ``world_size`` ranks.

    Args:
        - ``unpartitioned_numel``: number of elements of the full parameter
        - ``world_size``: number of ranks the parameter is partitioned over

    Returns:
        - tuple ``(partitioned_numel, padding_numel)``: elements per rank (rounded up) and the number
          of elements needed to round ``unpartitioned_numel`` up to a multiple of ``world_size``
    """
    # integer arithmetic only: math.ceil(a / b) goes through a float and is inexact above 2**53
    full_chunks, remainder = divmod(unpartitioned_numel, world_size)
    partitioned_numel = full_chunks + (1 if remainder else 0)
    padding_numel = (world_size - remainder) if remainder else 0
    return partitioned_numel, padding_numel
355
+
356
+
357
def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """
    Rebuild the frozen parameters of a zero stage 3 checkpoint and store them in ``state_dict``.

    Each parameter is obtained by concatenating its fragment from every model state, cutting the
    result down to the parameter's numel and viewing it with the recorded shape.

    Args:
        - ``state_dict``: mapping updated in place with ``name -> tensor`` entries
        - ``world_size``: number of ranks
        - ``zero_model_states``: parsed model states, one per rank

    Does nothing when the first model state has no frozen param shapes.
    """
    rank0_state = zero_model_states[0]
    if rank0_state.frozen_param_shapes is None or len(rank0_state.frozen_param_shapes) == 0:
        return

    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    frozen_param_shapes = rank0_state.frozen_param_shapes
    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    rank0_numel = sum(p.numel() for p in rank0_state.frozen_param_fragments.values())
    avail_numel = rank0_numel * world_size
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        unpartitioned_numel = shape.numel()
        total_params += 1
        total_numel += unpartitioned_numel

        # glue the per-rank fragments together, then drop whatever exceeds the real numel
        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        glued = torch.cat(param_frags, 0)
        state_dict[name] = glued.narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
391
+
392
+
393
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Rebuild the trainable parameters of a zero stage 3 checkpoint and store them in ``state_dict``.

    Args:
        - ``state_dict``: mapping updated in place with ``name -> tensor`` entries
        - ``world_size``: number of ranks
        - ``fp32_flat_groups``: one flat fp32 tensor per rank
        - ``zero_model_states``: parsed model states; only the first entry's ``param_shapes`` is used

    Raises:
        - ``ValueError``: if the numels consumed across all ranks differ from the numels available
    """
    # not asserting if there is a mismatch due to possible padding
    avail_numel = fp32_flat_groups[0].numel() * world_size

    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {}
    for group_shapes in zero_model_states[0].param_shapes:
        for k, v in group_shapes.items():
            param_shapes[k] = v

    if debug:
        for i in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")

    wanted_params = len(param_shapes)
    wanted_numel = sum(shape.numel() for shape in param_shapes.values())
    print(f"Trainable params: Have {avail_numel} numels to process.")
    print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():

        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        # take this param's slice out of every rank's flat tensor, glue the slices together and
        # drop whatever exceeds the real numel
        rank_slices = tuple(fp32_flat_groups[rank].narrow(0, offset, partitioned_numel) for rank in range(world_size))
        glued = torch.cat(rank_slices, 0)
        state_dict[name] = glued.narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    # offset counted per-rank elements so far; scale it to compare against all ranks
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
445
+
446
+
447
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
    """
    Assemble the full state_dict of a zero stage 3 checkpoint.

    Entries are added in this order: buffers, frozen params, trainable params, shared params.

    Returns:
        - ``OrderedDict`` mapping names to tensors
    """
    rank0_state = zero_model_states[0]
    state_dict = OrderedDict()

    # buffers
    state_dict.update(rank0_state.buffers)
    if debug:
        print(f"added {len(rank0_state.buffers)} buffers")

    _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for alias, source in rank0_state.shared_params:
        if source in state_dict:
            state_dict[alias] = state_dict[source]

    return state_dict
466
+
467
+
468
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
    """
    Turn a ZeRO 2 or 3 checkpoint into one consolidated fp32 state_dict. The result can be loaded
    with ``load_state_dict()`` and used for training without DeepSpeed, or shared with others, for
    example via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint, e.g., ``global_step14``. If not provided the tag is read from the file named ``latest`` inside ``checkpoint_dir``

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: if ``tag`` is not given and there is no ``latest`` file
        - ``FileNotFoundError``: if the tag folder doesn't exist inside ``checkpoint_dir``

    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    """
    if tag is None:
        # fall back to the tag recorded in the 'latest' file
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
    if os.path.isdir(ds_checkpoint_dir):
        return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)

    raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
515
+
516
+
517
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
    """
    Write the consolidated fp32 ``state_dict`` of a ZeRO 2 or 3 checkpoint to a single file. The file
    can be read back with ``torch.load(file)`` + ``load_state_dict()`` and used for training without
    DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
    """
    consolidated = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    print(f"Saving fp32 state dict to {output_file}")
    torch.save(consolidated, output_file)
531
+
532
+
533
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    2. Put the provided model to cpu
    3. Load the ``state_dict`` into it (with ``strict=False``)

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info("Extracting fp32 weights")
    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    cpu_model = model.cpu()
    cpu_model.load_state_dict(fp32_weights, strict=False)

    return cpu_model
570
+
571
+
572
if __name__ == "__main__":
    # Command-line entry point: zero_to_fp32.py <checkpoint_dir> <output_file> [-t TAG] [-d]
    parser = argparse.ArgumentParser()

    # positional arguments
    parser.add_argument(
        "checkpoint_dir",
        type=str,
        help="path to the desired checkpoint folder, e.g., path/checkpoint-12",
    )
    parser.add_argument(
        "output_file",
        type=str,
        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)",
    )

    # options
    parser.add_argument(
        "-t",
        "--tag",
        type=str,
        default=None,
        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1",
    )
    parser.add_argument("-d", "--debug", help="enable debug", action='store_true')

    args = parser.parse_args()

    # the helpers in this file read the module-level ``debug`` flag
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag)