mattbonnell committed on Sep 26, 2024

Commit

8403713

verified ·

1 Parent(s): 2715521

Training in progress, step 5500, checkpoint

Browse files

Files changed (17) hide show

last-checkpoint/global_step5500/mp_rank_00_model_states.pt +1 -1
last-checkpoint/global_step5500/zero_pp_rank_0_mp_rank_00_optim_states.pt +1 -1
last-checkpoint/global_step5500/zero_pp_rank_1_mp_rank_00_optim_states.pt +1 -1
last-checkpoint/global_step5500/zero_pp_rank_2_mp_rank_00_optim_states.pt +1 -1
last-checkpoint/global_step5500/zero_pp_rank_3_mp_rank_00_optim_states.pt +1 -1
last-checkpoint/global_step5500/zero_pp_rank_4_mp_rank_00_optim_states.pt +1 -1
last-checkpoint/global_step5500/zero_pp_rank_5_mp_rank_00_optim_states.pt +1 -1
last-checkpoint/latest +1 -1
last-checkpoint/model.safetensors +1 -1
last-checkpoint/rng_state_0.pth +1 -1
last-checkpoint/rng_state_1.pth +2 -2
last-checkpoint/rng_state_2.pth +1 -1
last-checkpoint/rng_state_3.pth +1 -1
last-checkpoint/rng_state_4.pth +2 -2
last-checkpoint/rng_state_5.pth +2 -2
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +73 -3

last-checkpoint/global_step5500/mp_rank_00_model_states.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c3bc55a30f02a3f26350bd9dbb16294a15a897b075fd34456a88839ee77910c4
 size 197282509

 version https://git-lfs.github.com/spec/v1
+oid sha256:a0cf14511d531f879ee2f6d406031ad511a3c06b18eed385a73b0dc215190115
 size 197282509

last-checkpoint/global_step5500/zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6cd182af3b4c944245fd020d35a0d824bf71c439f21ecda9cfb84927e3bf4593
 size 180416968

 version https://git-lfs.github.com/spec/v1
+oid sha256:f1b95be6942dace2bc3d825fa1446c5b063d8b4ee4c7ada108b22b131515ff13
 size 180416968

last-checkpoint/global_step5500/zero_pp_rank_1_mp_rank_00_optim_states.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cbdab2ae29a88e68fc1af1995e70903688b507c709051f3fc347179a065e0e55
 size 180416776

 version https://git-lfs.github.com/spec/v1
+oid sha256:6346f87e421ad31c9b43ee470eab53306c511165d8d0c10038223ac6d789e927
 size 180416776

last-checkpoint/global_step5500/zero_pp_rank_2_mp_rank_00_optim_states.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ffbf390a132b39ce10451e04f633f16d3d9d6dbcf32a1b84272612507c10194d
 size 180416776

 version https://git-lfs.github.com/spec/v1
+oid sha256:77cab693c8a4d6e0a3940be216a317089eeeb43f17a2d2d833c433151aad0dc5
 size 180416776

last-checkpoint/global_step5500/zero_pp_rank_3_mp_rank_00_optim_states.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e237aedc1ff457bcc7f26444494d869759759d1ba70524cc04e3661daf6c5a5a
 size 180416904

 version https://git-lfs.github.com/spec/v1
+oid sha256:733ad6e8b2d322d4e9fe62f82fd3e8f7f5bf482a7a6145cc963bda9577b003ad
 size 180416904

last-checkpoint/global_step5500/zero_pp_rank_4_mp_rank_00_optim_states.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3f78a218d94aea89b6a2546b76fa9c2b6630eedc525cf3c9cc507aecb2c8cea1
 size 180416712

 version https://git-lfs.github.com/spec/v1
+oid sha256:ededfa3ecb0f275dc715e2d4cfb49f679e813f3991d87dbadd47b80273eed9f3
 size 180416712

last-checkpoint/global_step5500/zero_pp_rank_5_mp_rank_00_optim_states.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:96767b9e7ca20ce511539e7c55cb887f1c5f3515e14ce6d12290cd51ccc0b127
 size 180417096

 version https://git-lfs.github.com/spec/v1
+oid sha256:9c4d3ed7c0c49b879a17e3ce8fa63eb45569d90d6a482ab717098c73b241d5f4
 size 180417096

last-checkpoint/latest CHANGED Viewed

@@ -1 +1 @@
-global_step5000

+global_step5500

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:57d4fd4308febebec1afff358cf12ab56256227add3b2957e331f5cd2e0691a4
 size 188836816

 version https://git-lfs.github.com/spec/v1
+oid sha256:706a405a712fb9cea8d3cdff6665196d4ab0a60404fa36039d2d2dfad216e4b0
 size 188836816

last-checkpoint/rng_state_0.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e3f3878f9de7b43bbfd935e88b0d1db5f297e106844e639e3ee8f61d23b71537
 size 15536

 version https://git-lfs.github.com/spec/v1
+oid sha256:4637f1f5c1da9bb29154d9963121566ca1c02eb09711378004397eb0d9601a2c
 size 15536

last-checkpoint/rng_state_1.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:08dc69792353f98f3975f2146c88ff60332193593dccd2ba5cecd781b9997d67
-size 15536

 version https://git-lfs.github.com/spec/v1
+oid sha256:23c45dd7c491ef8cfd05485f13a9666de8a90413d508f927cbd46a1f5f1b8ffb
+size 15472

last-checkpoint/rng_state_2.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b3d105c1a1aa212e4c659fcde3507788ba2d1192d170b13840512005d89c43c0
 size 15536

 version https://git-lfs.github.com/spec/v1
+oid sha256:38614bdb1925ed297bb20ccfe06b7ec2af4d8a762a5ae888492a250a49f694a7
 size 15536

last-checkpoint/rng_state_3.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:67b13c4c348b0a86bf4854b80c140b765c791fc26576cf4cc28641b09a92b2c2
 size 15536

 version https://git-lfs.github.com/spec/v1
+oid sha256:404fdaaba0be21984fe31d2c6a0497ee60427a53f8ff8c86eca1f8f176009870
 size 15536

last-checkpoint/rng_state_4.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a7cca33316c4c01433fe48556b2852d35fc05fe7ebfbd4c36c5bfb4e6f315283
-size 15472

 version https://git-lfs.github.com/spec/v1
+oid sha256:ced3ab1b81f732a49c9788e74bc896f0ee9b2b5818128f9450e4a5af9190e6f2
+size 15536

last-checkpoint/rng_state_5.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:67a8ec020afb0b24788f834efe44fce3daac72b42efaf574b93244a2db48d417
-size 15536

 version https://git-lfs.github.com/spec/v1
+oid sha256:6301d10b310bc985fcae6a7295ae99ac9e593c77c7cac2ef61d2dba1e9541e7d
+size 15600

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2464b12e1ca0e0289d4336d5d6478dcc4211982d590e1532eea5a3218a3c1d09
 size 1256

 version https://git-lfs.github.com/spec/v1
+oid sha256:ff0c32be67ef9a7f2ea73a46b364adf3b30fe6a1d94a39f03dae5f70dfb3e7d2
 size 1256

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 135.13513513513513,
   "eval_steps": 1500,
-  "global_step": 5000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -725,6 +725,76 @@
       "learning_rate": 0.0001,
       "loss": 0.0331,
       "step": 5000
     }
   ],
   "logging_steps": 50,
@@ -744,7 +814,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.4035163536540303e+20,
   "train_batch_size": 64,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 148.64864864864865,
   "eval_steps": 1500,
+  "global_step": 5500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "learning_rate": 0.0001,
       "loss": 0.0331,
       "step": 5000
+    },
+    {
+      "epoch": 136.48648648648648,
+      "grad_norm": 0.4285108149051666,
+      "learning_rate": 0.0001,
+      "loss": 0.0304,
+      "step": 5050
+    },
+    {
+      "epoch": 137.83783783783784,
+      "grad_norm": 0.3434101343154907,
+      "learning_rate": 0.0001,
+      "loss": 0.0294,
+      "step": 5100
+    },
+    {
+      "epoch": 139.1891891891892,
+      "grad_norm": 0.41777992248535156,
+      "learning_rate": 0.0001,
+      "loss": 0.0302,
+      "step": 5150
+    },
+    {
+      "epoch": 140.54054054054055,
+      "grad_norm": 0.3897533714771271,
+      "learning_rate": 0.0001,
+      "loss": 0.0303,
+      "step": 5200
+    },
+    {
+      "epoch": 141.8918918918919,
+      "grad_norm": 0.3457304537296295,
+      "learning_rate": 0.0001,
+      "loss": 0.0297,
+      "step": 5250
+    },
+    {
+      "epoch": 143.24324324324326,
+      "grad_norm": 0.38188374042510986,
+      "learning_rate": 0.0001,
+      "loss": 0.0291,
+      "step": 5300
+    },
+    {
+      "epoch": 144.59459459459458,
+      "grad_norm": 0.44426918029785156,
+      "learning_rate": 0.0001,
+      "loss": 0.0308,
+      "step": 5350
+    },
+    {
+      "epoch": 145.94594594594594,
+      "grad_norm": 0.46593207120895386,
+      "learning_rate": 0.0001,
+      "loss": 0.0306,
+      "step": 5400
+    },
+    {
+      "epoch": 147.2972972972973,
+      "grad_norm": 0.5084848403930664,
+      "learning_rate": 0.0001,
+      "loss": 0.0293,
+      "step": 5450
+    },
+    {
+      "epoch": 148.64864864864865,
+      "grad_norm": 0.35385948419570923,
+      "learning_rate": 0.0001,
+      "loss": 0.0307,
+      "step": 5500
     }
   ],
   "logging_steps": 50,
       "attributes": {}
     }
   },
+  "total_flos": 1.5440555615799634e+20,
   "train_batch_size": 64,
   "trial_name": null,
   "trial_params": null