3outeille (HF staff) committed
Commit 9e80437
1 parent: 7b3178d

Upload llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-2

.gitattributes CHANGED
@@ -66,3 +66,4 @@ llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-16/profiler/ip-26-0-163-147_683312.171994997
66
  llama-1B/16_GPUS/dp-2_tp-8_pp-1_mbz-4/profiler/ip-26-0-171-21_2582701.1719950103572137437.pt.trace.json.tmp filter=lfs diff=lfs merge=lfs -text
67
  llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-8/profiler/ip-26-0-169-139_2571529.1719950310974795475.pt.trace.json filter=lfs diff=lfs merge=lfs -text
68
  llama-1B/16_GPUS/dp-4_tp-2_pp-2_mbz-4/profiler/ip-26-0-160-225_1672146.1719950266162829584.pt.trace.json filter=lfs diff=lfs merge=lfs -text
69
+ llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-2/profiler/ip-26-0-163-147_704351.1719950608420005259.pt.trace.json filter=lfs diff=lfs merge=lfs -text
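The added rule above is the form `git lfs track` writes into `.gitattributes`, so the new profiler trace is stored through Git LFS rather than plain Git. A minimal sketch of adding such a rule programmatically, assuming `git` and `git-lfs` are installed and run from the repository root; the pattern below is only illustrative:

```python
# Minimal sketch: register the new profiler trace with Git LFS so that
# .gitattributes gains a "filter=lfs diff=lfs merge=lfs -text" rule like the
# one added above. Assumes git and git-lfs are installed; the pattern is
# illustrative, not taken from the original upload script.
import subprocess

pattern = "llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-2/profiler/*.pt.trace.json"

# `git lfs track` appends the matching rule to .gitattributes.
subprocess.run(["git", "lfs", "track", pattern], check=True)
# Stage the updated attributes file so the rule ships with the trace.
subprocess.run(["git", "add", ".gitattributes"], check=True)
```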
llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-2/log.out CHANGED
@@ -1,5 +1,5 @@
1
  ========================
2
- START TIME: Tue Jul 2 16:30:13 UTC 2024
3
  python3 version = Python 3.10.14
4
  ========================
5
  The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
@@ -14,120 +14,284 @@ M src/nanotron/models/llama.py
14
  M src/nanotron/trainer.py
15
  Your branch is up to date with 'origin/bench_cluster'.
16
  Job status: RUNNING
17
- W0702 16:30:16.125000 140327896655680 torch/distributed/run.py:757]
18
- W0702 16:30:16.125000 140327896655680 torch/distributed/run.py:757] *****************************************
19
- W0702 16:30:16.125000 140327896655680 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
20
- W0702 16:30:16.125000 140327896655680 torch/distributed/run.py:757] *****************************************
21
- W0702 16:30:16.135000 139775981082432 torch/distributed/run.py:757]
22
- W0702 16:30:16.135000 139775981082432 torch/distributed/run.py:757] *****************************************
23
- W0702 16:30:16.135000 139775981082432 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
24
- W0702 16:30:16.135000 139775981082432 torch/distributed/run.py:757] *****************************************
25
- [default0]:[W socket.cpp:464] [c10d] The server socket has failed to bind to [::]:36391 (errno: 98 - Address already in use).
26
- [default0]:[W socket.cpp:464] [c10d] The server socket has failed to bind to ?UNKNOWN? (errno: 98 - Address already in use).
27
- [default0]:[E socket.cpp:500] [c10d] The server socket has failed to listen on any local network address.
28
- [default0]:Traceback (most recent call last):
29
- [default0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 233, in <module>
30
- [default0]: trainer = DistributedTrainer(config_file)
31
- [default0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 145, in __init__
32
- [default0]: self.parallel_context = ParallelContext(
33
- [default0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/context.py", line 53, in __init__
34
- [default0]: dist.initialize_torch_distributed()
35
- [default0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/distributed.py", line 278, in initialize_torch_distributed
36
- [default0]: dist.init_process_group(
37
- [default0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
38
- [default0]: return func(*args, **kwargs)
39
- [default0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 89, in wrapper
40
- [default0]: func_return = func(*args, **kwargs)
41
- [default0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1305, in init_process_group
42
- [default0]: store, rank, world_size = next(rendezvous_iterator)
43
- [default0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/rendezvous.py", line 246, in _env_rendezvous_handler
44
- [default0]: store = _create_c10d_store(master_addr, master_port, rank, world_size, timeout, use_libuv)
45
- [default0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/rendezvous.py", line 174, in _create_c10d_store
46
- [default0]: return TCPStore(
47
- [default0]:torch.distributed.DistNetworkError: The server socket has failed to listen on any local network address. The server socket has failed to bind to [::]:36391 (errno: 98 - Address already in use). The server socket has failed to bind to ?UNKNOWN? (errno: 98 - Address already in use).
48
- W0702 16:30:27.302000 140327896655680 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 853935 closing signal SIGTERM
49
- W0702 16:30:27.303000 140327896655680 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 853936 closing signal SIGTERM
50
- W0702 16:30:27.303000 140327896655680 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 853937 closing signal SIGTERM
51
- W0702 16:30:27.303000 140327896655680 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 853938 closing signal SIGTERM
52
- W0702 16:30:27.303000 140327896655680 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 853939 closing signal SIGTERM
53
- W0702 16:30:27.303000 140327896655680 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 853940 closing signal SIGTERM
54
- W0702 16:30:27.304000 140327896655680 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 853941 closing signal SIGTERM
55
- E0702 16:30:27.706000 140327896655680 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 853934) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
56
  Traceback (most recent call last):
57
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
58
- sys.exit(main())
59
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
60
- return f(*args, **kwargs)
61
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
62
- run(args)
63
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
64
- elastic_launch(
65
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
66
- return launch_agent(self._config, self._entrypoint, list(args))
67
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
68
- raise ChildFailedError(
69
- torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
70
- ============================================================
71
- /fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
72
- ------------------------------------------------------------
73
- Failures:
74
- <NO_OTHER_FAILURES>
75
- ------------------------------------------------------------
76
- Root Cause (first observed failure):
77
- [0]:
78
- time : 2024-07-02_16:30:27
79
- host : ip-26-0-163-43.ec2.internal
80
- rank : 0 (local_rank: 0)
81
- exitcode : 1 (pid: 853934)
82
- error_file: <N/A>
83
- traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
84
- ============================================================
85
- srun: error: ip-26-0-163-43: task 0: Exited with exit code 1
86
- W0702 16:30:31.395000 139770314262272 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-169-207.ec2.internal_2422832_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
87
- W0702 16:30:32.306000 139775981082432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2422901 closing signal SIGTERM
88
- W0702 16:30:32.307000 139775981082432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2422902 closing signal SIGTERM
89
- W0702 16:30:32.307000 139775981082432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2422903 closing signal SIGTERM
90
- W0702 16:30:32.307000 139775981082432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2422904 closing signal SIGTERM
91
- W0702 16:30:32.307000 139775981082432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2422905 closing signal SIGTERM
92
- W0702 16:30:32.307000 139775981082432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2422906 closing signal SIGTERM
93
- W0702 16:30:32.307000 139775981082432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2422907 closing signal SIGTERM
94
- W0702 16:30:32.307000 139775981082432 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2422908 closing signal SIGTERM
95
- W0702 16:30:32.813000 139775981082432 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-207.ec2.internal_2422832_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
96
- W0702 16:30:32.820000 139775981082432 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-169-207.ec2.internal_2422832_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
97
  Traceback (most recent call last):
98
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store
99
- return getattr(self._store, store_op)(*args, **kwargs)
100
- torch.distributed.DistNetworkError: Broken pipe
101
-
102
- The above exception was the direct cause of the following exception:
103
-
104
- Traceback (most recent call last):
105
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
106
- sys.exit(main())
107
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
108
- return f(*args, **kwargs)
109
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
110
- run(args)
111
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
112
- elastic_launch(
113
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
114
- return launch_agent(self._config, self._entrypoint, list(args))
115
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent
116
- result = agent.run()
117
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper
118
- result = f(*args, **kwargs)
119
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run
120
- result = self._invoke_run(role)
121
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run
122
- num_nodes_waiting = rdzv_handler.num_nodes_waiting()
123
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting
124
- self._state_holder.sync()
125
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync
126
- get_response = self._backend.get_state()
127
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state
128
- base64_state: bytes = self._call_store("get", self._key)
129
- File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store
130
- raise RendezvousConnectionError(
131
- torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details.
132
- srun: error: ip-26-0-169-207: task 1: Exited with exit code 1
133
  Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
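
The run removed above failed at rendezvous with `errno: 98 - Address already in use` on port 36391 before being retried below. A minimal pre-flight sketch, not from the original launch script (the port number and IPv4-only bind are only illustrative), that probes the master port before invoking torchrun:

```python
# Rough pre-flight check for the rendezvous port, mirroring the
# "Address already in use" failure above. Illustrative only: it binds IPv4,
# while the failing socket in the log was the IPv6 wildcard [::]:36391.
import socket

def port_is_free(port: int, host: str = "") -> bool:
    """Return True if we can bind the TCP port, i.e. nothing is listening on it."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        try:
            s.bind((host, port))
            return True
        except OSError:
            return False

if not port_is_free(36391):  # port taken from the error message above
    raise SystemExit("Master port 36391 is busy; pick a different MASTER_PORT")
```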
 
 
1
  ========================
2
+ START TIME: Tue Jul 2 19:59:42 UTC 2024
3
  python3 version = Python 3.10.14
4
  ========================
5
  The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
 
14
  M src/nanotron/trainer.py
15
  Your branch is up to date with 'origin/bench_cluster'.
16
  Job status: RUNNING
17
+ W0702 19:59:45.097000 139860515530560 torch/distributed/run.py:757]
18
+ W0702 19:59:45.097000 139860515530560 torch/distributed/run.py:757] *****************************************
19
+ W0702 19:59:45.097000 139860515530560 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
20
+ W0702 19:59:45.097000 139860515530560 torch/distributed/run.py:757] *****************************************
21
+ W0702 19:59:45.099000 139859587696448 torch/distributed/run.py:757]
22
+ W0702 19:59:45.099000 139859587696448 torch/distributed/run.py:757] *****************************************
23
+ W0702 19:59:45.099000 139859587696448 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
24
+ W0702 19:59:45.099000 139859587696448 torch/distributed/run.py:757] *****************************************
25
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Config:
26
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Config(general=GeneralArgs(project='bench_cluster',
27
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: run='%date_%jobid',
28
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: seed=42,
29
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: step=None,
30
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: consumed_train_samples=None,
31
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: benchmark_csv_path=None,
32
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: ignore_sanity_checks=True),
33
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: parallelism=ParallelismArgs(dp=16,
34
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: pp=1,
35
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: tp=1,
36
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: pp_engine=<nanotron.parallel.pipeline_parallel.engine.OneForwardOneBackwardPipelineEngine object at 0x7ff74a400910>,
37
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: tp_mode=<TensorParallelLinearMode.REDUCE_SCATTER: 2>,
38
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: tp_linear_async_communication=False,
39
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: expert_parallel_size=1),
40
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1,
41
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: eos_token_id=2,
42
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: hidden_act='silu',
43
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: hidden_size=2048,
44
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: initializer_range=0.02,
45
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: intermediate_size=4096,
46
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: is_llama_config=True,
47
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: max_position_embeddings=4096,
48
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: num_attention_heads=32,
49
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: num_hidden_layers=24,
50
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: num_key_value_heads=32,
51
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: pad_token_id=None,
52
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: pretraining_tp=1,
53
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: rms_norm_eps=1e-05,
54
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: rope_scaling=None,
55
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: rope_theta=10000.0,
56
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: tie_word_embeddings=True,
57
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: use_cache=True,
58
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: vocab_size=50257),
59
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: init_method=RandomInit(std=0.025),
60
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: dtype=torch.bfloat16,
61
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: make_vocab_size_divisible_by=1,
62
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: ddp_bucket_cap_mb=25),
63
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2',
64
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: tokenizer_revision=None,
65
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: tokenizer_max_length=None),
66
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'),
67
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: checkpoint_interval=100000,
68
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: save_initial_state=False,
69
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: resume_checkpoint_path=None,
70
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: checkpoints_path_is_shared_file_system=False),
71
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: logging=LoggingArgs(log_level='info',
72
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: log_level_replica='info',
73
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration_step_info_interval=1),
74
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: tokens=TokensArgs(sequence_length=4096,
75
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: train_steps=20,
76
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: micro_batch_size=2,
77
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: batch_accumulation_per_replica=32,
78
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: val_check_interval=-1,
79
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: limit_val_batches=0,
80
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: limit_test_batches=0),
81
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08,
82
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: adam_beta1=0.9,
83
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: adam_beta2=0.95,
84
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: torch_adam_is_fused=True,
85
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: name='adamW'),
86
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: zero_stage=1,
87
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: weight_decay=0.01,
88
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: clip_grad=1.0,
89
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: accumulate_grad_in_fp32=True,
90
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001,
91
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: lr_warmup_steps=1,
92
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: lr_warmup_style='linear',
93
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: lr_decay_style='linear',
94
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: lr_decay_steps=19,
95
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: lr_decay_starting_step=None,
96
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: min_decay_lr=1e-05)),
97
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: data_stages=[DatasetStageArgs(name='Training Stage',
98
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: start_training_step=1,
99
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories',
100
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: hf_dataset_splits='train',
101
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: hf_dataset_config_name=None,
102
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: dataset_processing_num_proc_per_process=64,
103
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: dataset_overwrite_cache=False,
104
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: text_column_name='text'),
105
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: seed=42,
106
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: num_loading_workers=32))],
107
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-2')),
108
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: lighteval=None)
109
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Model Config:
110
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: LlamaConfig(bos_token_id=1,
111
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: eos_token_id=2,
112
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: hidden_act='silu',
113
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: hidden_size=2048,
114
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: initializer_range=0.02,
115
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: intermediate_size=4096,
116
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: is_llama_config=True,
117
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: max_position_embeddings=4096,
118
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: num_attention_heads=32,
119
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: num_hidden_layers=24,
120
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: num_key_value_heads=32,
121
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: pad_token_id=None,
122
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: pretraining_tp=1,
123
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: rms_norm_eps=1e-05,
124
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: rope_scaling=None,
125
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: rope_theta=10000.0,
126
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: tie_word_embeddings=True,
127
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: use_cache=True,
128
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: vocab_size=50257)
129
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Building model..
130
+ [default0]:07/02/2024 20:00:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Setting PP block ranks...
131
+ [default4]:07/02/2024 20:00:11 [INFO|DP=12|PP=0|TP=0|ip-26-0-163-226]: No checkpoint path provided.
132
+ [default5]:07/02/2024 20:00:11 [INFO|DP=13|PP=0|TP=0|ip-26-0-163-226]: No checkpoint path provided.
133
+ [default6]:07/02/2024 20:00:11 [INFO|DP=14|PP=0|TP=0|ip-26-0-163-226]: No checkpoint path provided.
134
+ [default1]:07/02/2024 20:00:11 [INFO|DP=9|PP=0|TP=0|ip-26-0-163-226]: No checkpoint path provided.
135
+ [default2]:07/02/2024 20:00:11 [INFO|DP=10|PP=0|TP=0|ip-26-0-163-226]: No checkpoint path provided.
136
+ [default3]:07/02/2024 20:00:11 [INFO|DP=11|PP=0|TP=0|ip-26-0-163-226]: No checkpoint path provided.
137
+ [default0]:07/02/2024 20:00:11 [INFO|DP=8|PP=0|TP=0|ip-26-0-163-226]: No checkpoint path provided.
138
+ [default7]:07/02/2024 20:00:11 [INFO|DP=15|PP=0|TP=0|ip-26-0-163-226]: No checkpoint path provided.
139
+ [default0]:07/02/2024 20:00:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Total number of parameters: 1.11G (2116.51MiB)
140
+ [default0]:07/02/2024 20:00:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Local number of parameters: 1.11G (2116.51MiB)
141
+ [default0]:07/02/2024 20:00:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [After model building] Memory usage: 2140.53MiB. Peak allocated: 2338.88MiB Peak reserved: 2392.00MiB
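The reported 1.11G parameters and 2116.51MiB line up with the LlamaConfig printed above; a back-of-the-envelope check, assuming standard LLaMA layer shapes and tied embeddings (not taken from the training code):

```python
# Parameter-count sanity check against the config above (illustrative only).
hidden, inter, layers, vocab = 2048, 4096, 24, 50257

attn = 4 * hidden * hidden           # q, k, v, o projections
mlp = 3 * hidden * inter             # gate, up and down projections
norms = 2 * hidden                   # two RMSNorm weights per layer
per_layer = attn + mlp + norms

embeddings = vocab * hidden          # tie_word_embeddings=True, so counted once
total = layers * per_layer + embeddings + hidden  # + final RMSNorm

print(f"{total / 1e9:.2f}G parameters")        # ~1.11G
print(f"{total * 2 / 2**20:.0f}MiB in bf16")   # ~2116MiB at 2 bytes per parameter
```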
142
+ [default0]:07/02/2024 20:00:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: No checkpoint path provided.
143
+ [default0]:07/02/2024 20:00:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Parametrizing model parameters using StandardParametrizator
144
+ [default2]:07/02/2024 20:00:11 [INFO|DP=2|PP=0|TP=0|ip-26-0-163-147]: No checkpoint path provided.
145
+ [default1]:07/02/2024 20:00:11 [INFO|DP=1|PP=0|TP=0|ip-26-0-163-147]: No checkpoint path provided.
146
+ [default4]:07/02/2024 20:00:11 [INFO|DP=4|PP=0|TP=0|ip-26-0-163-147]: No checkpoint path provided.
147
+ [default6]:07/02/2024 20:00:11 [INFO|DP=6|PP=0|TP=0|ip-26-0-163-147]: No checkpoint path provided.
148
+ [default5]:07/02/2024 20:00:11 [INFO|DP=5|PP=0|TP=0|ip-26-0-163-147]: No checkpoint path provided.
149
+ [default7]:07/02/2024 20:00:11 [INFO|DP=7|PP=0|TP=0|ip-26-0-163-147]: No checkpoint path provided.
150
+ [default3]:07/02/2024 20:00:11 [INFO|DP=3|PP=0|TP=0|ip-26-0-163-147]: No checkpoint path provided.
151
+ [default0]:07/02/2024 20:00:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [Optimizer Building] Using LearningRateForSP as learning rate
152
+ [default0]:07/02/2024 20:00:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [ZeRO sharding] Size of optimizer params per rank:
153
+ [default0]:07/02/2024 20:00:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [ZeRO sharding] DP Rank 0 has 69.4M out of 1.11G (6.25%) params' optimizer states
154
+ [default0]:07/02/2024 20:00:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [ZeRO sharding] DP Rank 1 has 69.4M out of 1.11G (6.25%) params' optimizer states
155
+ [default0]:07/02/2024 20:00:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [ZeRO sharding] DP Rank 2 has 69.4M out of 1.11G (6.25%) params' optimizer states
156
+ [default0]:07/02/2024 20:00:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [ZeRO sharding] DP Rank 3 has 69.4M out of 1.11G (6.25%) params' optimizer states
157
+ [default0]:07/02/2024 20:00:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [ZeRO sharding] DP Rank 4 has 69.4M out of 1.11G (6.25%) params' optimizer states
158
+ [default0]:07/02/2024 20:00:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [ZeRO sharding] DP Rank 5 has 69.4M out of 1.11G (6.25%) params' optimizer states
159
+ [default0]:07/02/2024 20:00:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [ZeRO sharding] DP Rank 6 has 69.4M out of 1.11G (6.25%) params' optimizer states
160
+ [default0]:07/02/2024 20:00:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [ZeRO sharding] DP Rank 7 has 69.4M out of 1.11G (6.25%) params' optimizer states
161
+ [default0]:07/02/2024 20:00:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [ZeRO sharding] DP Rank 8 has 69.4M out of 1.11G (6.25%) params' optimizer states
162
+ [default0]:07/02/2024 20:00:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [ZeRO sharding] DP Rank 9 has 69.4M out of 1.11G (6.25%) params' optimizer states
163
+ [default0]:07/02/2024 20:00:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [ZeRO sharding] DP Rank 10 has 69.4M out of 1.11G (6.25%) params' optimizer states
164
+ [default0]:07/02/2024 20:00:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [ZeRO sharding] DP Rank 11 has 69.4M out of 1.11G (6.25%) params' optimizer states
165
+ [default0]:07/02/2024 20:00:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [ZeRO sharding] DP Rank 12 has 69.4M out of 1.11G (6.25%) params' optimizer states
166
+ [default0]:07/02/2024 20:00:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [ZeRO sharding] DP Rank 13 has 69.4M out of 1.11G (6.25%) params' optimizer states
167
+ [default0]:07/02/2024 20:00:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [ZeRO sharding] DP Rank 14 has 69.4M out of 1.11G (6.25%) params' optimizer states
168
+ [default0]:07/02/2024 20:00:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [ZeRO sharding] DP Rank 15 has 69.4M out of 1.11G (6.25%) params' optimizer states
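The per-rank figures above are simply the total split evenly across the 16 data-parallel ranks under ZeRO stage 1:

```python
# ZeRO-1 sharding check: optimizer states are split evenly across dp=16 ranks.
total_params = 1.11e9  # "Total number of parameters" reported above
dp = 16
print(f"{total_params / dp / 1e6:.1f}M params' optimizer states per rank")  # ~69.4M
print(f"{100 / dp:.2f}% of the model per rank")                             # 6.25%
```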
169
+ [default0]:07/02/2024 20:00:21 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
170
+ [default0]:07/02/2024 20:00:21 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Using `datasets` library
171
+ [default0]:07/02/2024 20:00:21 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
172
+ [default0]:07/02/2024 20:00:22 [WARNING|DP=0|PP=0|TP=0|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty.
173
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
174
+ [default0]:07/02/2024 20:00:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [Training Plan] There are 1 training stages
175
+ [default0]:07/02/2024 20:00:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [Stage Training Stage] start from step 1
176
+ [default0]:07/02/2024 20:00:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]:
177
+ [default0]:07/02/2024 20:00:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: [Start training] datetime: 2024-07-02 20:00:23.244810 | mbs: 2 | grad_accum: 32 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
178
+ [default0]:07/02/2024 20:00:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
179
+ [default0]:07/02/2024 20:00:23 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 6639.09MiB. Peak allocated 6639.09MiB. Peak reserved: 6892.00MiB
180
+ [default0]:07/02/2024 20:00:23 [WARNING|DP=8|PP=0|TP=0|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty.
181
+ [default1]:07/02/2024 20:00:23 [WARNING|DP=9|PP=0|TP=0|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty.
182
+ [default3]:07/02/2024 20:00:23 [WARNING|DP=11|PP=0|TP=0|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty.
183
+ [default6]:07/02/2024 20:00:23 [WARNING|DP=14|PP=0|TP=0|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty.
184
+ [default4]:07/02/2024 20:00:23 [WARNING|DP=12|PP=0|TP=0|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty.
185
+ [default5]:07/02/2024 20:00:23 [WARNING|DP=13|PP=0|TP=0|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty.
186
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
187
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
188
+ [default7]:07/02/2024 20:00:23 [WARNING|DP=15|PP=0|TP=0|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty.
189
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
190
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
191
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
192
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
193
+ [default1]:07/02/2024 20:00:23 [WARNING|DP=1|PP=0|TP=0|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty.
194
+ [default3]:07/02/2024 20:00:23 [WARNING|DP=3|PP=0|TP=0|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty.
195
+ [default4]:07/02/2024 20:00:23 [WARNING|DP=4|PP=0|TP=0|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty.
196
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
197
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
198
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
199
+ [default7]:07/02/2024 20:00:23 [WARNING|DP=7|PP=0|TP=0|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty.
200
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
201
+ [default6]:07/02/2024 20:00:23 [WARNING|DP=6|PP=0|TP=0|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty.
202
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
203
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
204
+ [default2]:07/02/2024 20:00:23 [WARNING|DP=10|PP=0|TP=0|ip-26-0-163-226]: Repo card metadata block was not found. Setting CardData to empty.
205
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
206
+ [default5]:07/02/2024 20:00:23 [WARNING|DP=5|PP=0|TP=0|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty.
207
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
208
+ [default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
209
+ [default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
210
+ [default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
211
+ [default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
212
+ [default2]:07/02/2024 20:00:28 [WARNING|DP=2|PP=0|TP=0|ip-26-0-163-147]: Repo card metadata block was not found. Setting CardData to empty.
213
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
214
+ [default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
215
+ [default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
216
+ [default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
217
+ [default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
218
+ [default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
219
+ [default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
220
+ [default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
221
+ [default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
222
+ [default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
223
+ [default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
224
+ [default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
225
+ [default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
226
+ [default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
227
+ [default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
228
+ [default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
229
+ [default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
230
+ [default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
231
+ [default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
232
+ [default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
233
+ [default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
234
+ [default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
235
+ [default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
236
+ [default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
237
+ [default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
238
+ [default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
239
+ [default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
240
+ [default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
241
+ [default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
242
+ [default0]:07/02/2024 20:00:34 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 6715.25MiB. Peak allocated 24538.23MiB. Peak reserved: 25402.00MiB
243
+ [default0]:07/02/2024 20:00:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration: 1 / 20 | consumed_tokens: 4.19M | elapsed_time_per_iteration_ms: 18.9K | tokens_per_sec: 222K | tokens_per_sec_per_gpu: 13.9K | global_batch_size: 1.02K | lm_loss: 11.3 | lr: 0.0001 | model_tflops_per_gpu: 126 | hardware_tflops_per_gpu: 126 | grad_norm: 33.1 | cuda_memory_allocated: 7.6G | cuda_max_memory_reserved: 28.9G | hd_total_memory_tb: 312G | hd_used_memory_tb: 68.6G | hd_free_memory_tb: 244G
244
+ [default0]:07/02/2024 20:00:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.45MiB. Peak allocated 11617.74MiB. Peak reserved: 27544.00MiB
245
+ [default0]:07/02/2024 20:00:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.46MiB. Peak allocated 25075.44MiB. Peak reserved: 27568.00MiB
246
+ [default0]:07/02/2024 20:00:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration: 2 / 20 | consumed_tokens: 8.39M | elapsed_time_per_iteration_ms: 9.35K | tokens_per_sec: 449K | tokens_per_sec_per_gpu: 28K | global_batch_size: 1.02K | lm_loss: 11.3 | lr: 9.53e-05 | model_tflops_per_gpu: 254 | hardware_tflops_per_gpu: 254 | grad_norm: 33.3 | cuda_memory_allocated: 7.6G | cuda_max_memory_reserved: 28.9G | hd_total_memory_tb: 312G | hd_used_memory_tb: 68.6G | hd_free_memory_tb: 244G
247
+ [default0]:07/02/2024 20:00:51 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.45MiB. Peak allocated 11617.76MiB. Peak reserved: 27568.00MiB
248
+ [default0]:07/02/2024 20:00:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.46MiB. Peak allocated 25075.44MiB. Peak reserved: 27568.00MiB
249
+ [default0]:07/02/2024 20:01:00 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration: 3 / 20 | consumed_tokens: 12.6M | elapsed_time_per_iteration_ms: 9.08K | tokens_per_sec: 462K | tokens_per_sec_per_gpu: 28.9K | global_batch_size: 1.02K | lm_loss: 16 | lr: 9.05e-05 | model_tflops_per_gpu: 262 | hardware_tflops_per_gpu: 262 | grad_norm: 249 | cuda_memory_allocated: 7.6G | cuda_max_memory_reserved: 28.9G | hd_total_memory_tb: 312G | hd_used_memory_tb: 68.6G | hd_free_memory_tb: 244G
250
+ [default0]:07/02/2024 20:01:00 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.45MiB. Peak allocated 11617.76MiB. Peak reserved: 27568.00MiB
251
+ [default0]:STAGE:2024-07-02 20:01:00 704351:704351 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
252
+ [default0]:07/02/2024 20:01:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.46MiB. Peak allocated 25075.44MiB. Peak reserved: 27568.00MiB
253
+ [default0]:07/02/2024 20:01:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration: 4 / 20 | consumed_tokens: 16.8M | elapsed_time_per_iteration_ms: 9.64K | tokens_per_sec: 435K | tokens_per_sec_per_gpu: 27.2K | global_batch_size: 1.02K | lm_loss: 15.1 | lr: 8.58e-05 | model_tflops_per_gpu: 247 | hardware_tflops_per_gpu: 247 | grad_norm: 41.6 | cuda_memory_allocated: 7.6G | cuda_max_memory_reserved: 28.9G | hd_total_memory_tb: 312G | hd_used_memory_tb: 68.6G | hd_free_memory_tb: 244G
254
+ [default0]:07/02/2024 20:01:10 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.45MiB. Peak allocated 11617.76MiB. Peak reserved: 27568.00MiB
255
+ [default0]:07/02/2024 20:01:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration: 5 / 20 | consumed_tokens: 21M | elapsed_time_per_iteration_ms: 9.5K | tokens_per_sec: 441K | tokens_per_sec_per_gpu: 27.6K | global_batch_size: 1.02K | lm_loss: 10.8 | lr: 8.11e-05 | model_tflops_per_gpu: 250 | hardware_tflops_per_gpu: 250 | grad_norm: 25.9
256
+ [default0]:07/02/2024 20:01:19 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.45MiB. Peak allocated 25075.44MiB. Peak reserved: 27568.00MiB
257
+ [default0]:07/02/2024 20:01:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration: 6 / 20 | consumed_tokens: 25.2M | elapsed_time_per_iteration_ms: 9.73K | tokens_per_sec: 431K | tokens_per_sec_per_gpu: 26.9K | global_batch_size: 1.02K | lm_loss: 10.8 | lr: 7.63e-05 | model_tflops_per_gpu: 244 | hardware_tflops_per_gpu: 244 | grad_norm: 18.9
+ [default0]:STAGE:2024-07-02 20:01:47 704351:704351 ActivityProfilerController.cpp:320] Completed Stage: Collection
+ [default0]:STAGE:2024-07-02 20:01:49 704351:704351 ActivityProfilerController.cpp:324] Completed Stage: Post Processing
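Note: the three STAGE lines above are emitted by the PyTorch profiler (Kineto), which is enabled for a slice of this run and produces the .pt.trace.json added further down. The exact nanotron profiler configuration is not part of this log; a minimal, generic torch.profiler setup that goes through the same Warm Up / Collection / Post Processing stages would look roughly like this (step counts and output directory are assumptions):

    # Hedged sketch: generic torch.profiler usage, not nanotron's actual profiler config.
    import torch
    from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

    activities = [ProfilerActivity.CPU] + ([ProfilerActivity.CUDA] if torch.cuda.is_available() else [])

    prof = profile(
        activities=activities,
        schedule=schedule(wait=1, warmup=1, active=3, repeat=1),   # warm-up steps, then the collection window
        on_trace_ready=tensorboard_trace_handler("profiler"),      # writes <worker>.<timestamp>.pt.trace.json
    )
    prof.start()
    for step in range(6):              # assumed: profile only a handful of the 20 iterations
        x = torch.randn(256, 256)
        (x @ x).sum()                  # stand-in for one training step
        prof.step()
    prof.stop()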
+ [default0]:07/02/2024 20:03:59 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.45MiB. Peak allocated 25075.44MiB. Peak reserved: 27568.00MiB
+ [default0]:07/02/2024 20:04:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration: 7 / 20 | consumed_tokens: 29.4M | elapsed_time_per_iteration_ms: 8.9K | tokens_per_sec: 471K | tokens_per_sec_per_gpu: 29.5K | global_batch_size: 1.02K | lm_loss: 10.2 | lr: 7.16e-05 | model_tflops_per_gpu: 267 | hardware_tflops_per_gpu: 267 | grad_norm: 7.97
+ [default0]:07/02/2024 20:04:08 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.45MiB. Peak allocated 25075.44MiB. Peak reserved: 27568.00MiB
+ [default0]:07/02/2024 20:04:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration: 8 / 20 | consumed_tokens: 33.6M | elapsed_time_per_iteration_ms: 8.86K | tokens_per_sec: 473K | tokens_per_sec_per_gpu: 29.6K | global_batch_size: 1.02K | lm_loss: 9.15 | lr: 6.68e-05 | model_tflops_per_gpu: 268 | hardware_tflops_per_gpu: 268 | grad_norm: 6.46
+ [default0]:07/02/2024 20:04:17 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.45MiB. Peak allocated 25075.44MiB. Peak reserved: 27568.00MiB
+ [default0]:07/02/2024 20:04:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration: 9 / 20 | consumed_tokens: 37.7M | elapsed_time_per_iteration_ms: 9.08K | tokens_per_sec: 462K | tokens_per_sec_per_gpu: 28.9K | global_batch_size: 1.02K | lm_loss: 11.2 | lr: 6.21e-05 | model_tflops_per_gpu: 262 | hardware_tflops_per_gpu: 262 | grad_norm: 59.7
+ [default0]:07/02/2024 20:04:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.45MiB. Peak allocated 25075.44MiB. Peak reserved: 27568.00MiB
+ [default0]:07/02/2024 20:04:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration: 10 / 20 | consumed_tokens: 41.9M | elapsed_time_per_iteration_ms: 9.13K | tokens_per_sec: 459K | tokens_per_sec_per_gpu: 28.7K | global_batch_size: 1.02K | lm_loss: 9.6 | lr: 5.74e-05 | model_tflops_per_gpu: 260 | hardware_tflops_per_gpu: 260 | grad_norm: 44.2
+ [default0]:07/02/2024 20:04:35 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.45MiB. Peak allocated 25075.44MiB. Peak reserved: 27568.00MiB
+ [default0]:07/02/2024 20:04:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration: 11 / 20 | consumed_tokens: 46.1M | elapsed_time_per_iteration_ms: 9.39K | tokens_per_sec: 447K | tokens_per_sec_per_gpu: 27.9K | global_batch_size: 1.02K | lm_loss: 8.08 | lr: 5.26e-05 | model_tflops_per_gpu: 253 | hardware_tflops_per_gpu: 253 | grad_norm: 8.69
+ [default0]:07/02/2024 20:04:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.45MiB. Peak allocated 25075.44MiB. Peak reserved: 27568.00MiB
+ [default0]:07/02/2024 20:04:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration: 12 / 20 | consumed_tokens: 50.3M | elapsed_time_per_iteration_ms: 8.91K | tokens_per_sec: 471K | tokens_per_sec_per_gpu: 29.4K | global_batch_size: 1.02K | lm_loss: 7.86 | lr: 4.79e-05 | model_tflops_per_gpu: 267 | hardware_tflops_per_gpu: 267 | grad_norm: 5.1
+ [default0]:07/02/2024 20:04:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.45MiB. Peak allocated 25075.44MiB. Peak reserved: 27568.00MiB
+ [default0]:07/02/2024 20:05:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration: 13 / 20 | consumed_tokens: 54.5M | elapsed_time_per_iteration_ms: 9.06K | tokens_per_sec: 463K | tokens_per_sec_per_gpu: 28.9K | global_batch_size: 1.02K | lm_loss: 7.7 | lr: 4.32e-05 | model_tflops_per_gpu: 263 | hardware_tflops_per_gpu: 263 | grad_norm: 4.73
+ [default0]:07/02/2024 20:05:02 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.45MiB. Peak allocated 25075.44MiB. Peak reserved: 27568.00MiB
+ [default0]:07/02/2024 20:05:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration: 14 / 20 | consumed_tokens: 58.7M | elapsed_time_per_iteration_ms: 9.03K | tokens_per_sec: 464K | tokens_per_sec_per_gpu: 29K | global_batch_size: 1.02K | lm_loss: 7.56 | lr: 3.84e-05 | model_tflops_per_gpu: 263 | hardware_tflops_per_gpu: 263 | grad_norm: 5.09
+ [default0]:07/02/2024 20:05:11 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.45MiB. Peak allocated 25075.44MiB. Peak reserved: 27568.00MiB
+ [default0]:07/02/2024 20:05:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration: 15 / 20 | consumed_tokens: 62.9M | elapsed_time_per_iteration_ms: 9.03K | tokens_per_sec: 464K | tokens_per_sec_per_gpu: 29K | global_batch_size: 1.02K | lm_loss: 7.4 | lr: 3.37e-05 | model_tflops_per_gpu: 263 | hardware_tflops_per_gpu: 263 | grad_norm: 5.16
+ [default0]:07/02/2024 20:05:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.45MiB. Peak allocated 25075.44MiB. Peak reserved: 27568.00MiB
+ [default0]:07/02/2024 20:05:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration: 16 / 20 | consumed_tokens: 67.1M | elapsed_time_per_iteration_ms: 9.27K | tokens_per_sec: 453K | tokens_per_sec_per_gpu: 28.3K | global_batch_size: 1.02K | lm_loss: 7.3 | lr: 2.89e-05 | model_tflops_per_gpu: 257 | hardware_tflops_per_gpu: 257 | grad_norm: 5.15
+ [default0]:07/02/2024 20:05:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.45MiB. Peak allocated 25075.44MiB. Peak reserved: 27568.00MiB
+ [default0]:07/02/2024 20:05:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration: 17 / 20 | consumed_tokens: 71.3M | elapsed_time_per_iteration_ms: 9.24K | tokens_per_sec: 454K | tokens_per_sec_per_gpu: 28.4K | global_batch_size: 1.02K | lm_loss: 7.22 | lr: 2.42e-05 | model_tflops_per_gpu: 258 | hardware_tflops_per_gpu: 258 | grad_norm: 5.14
+ [default0]:07/02/2024 20:05:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.45MiB. Peak allocated 25075.44MiB. Peak reserved: 27568.00MiB
+ [default0]:07/02/2024 20:05:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration: 18 / 20 | consumed_tokens: 75.5M | elapsed_time_per_iteration_ms: 9.12K | tokens_per_sec: 460K | tokens_per_sec_per_gpu: 28.8K | global_batch_size: 1.02K | lm_loss: 7.15 | lr: 1.95e-05 | model_tflops_per_gpu: 261 | hardware_tflops_per_gpu: 261 | grad_norm: 5.04
+ [default0]:07/02/2024 20:05:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.45MiB. Peak allocated 25075.44MiB. Peak reserved: 27568.00MiB
+ [default0]:07/02/2024 20:05:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration: 19 / 20 | consumed_tokens: 79.7M | elapsed_time_per_iteration_ms: 9.11K | tokens_per_sec: 460K | tokens_per_sec_per_gpu: 28.8K | global_batch_size: 1.02K | lm_loss: 7.08 | lr: 1.47e-05 | model_tflops_per_gpu: 261 | hardware_tflops_per_gpu: 261 | grad_norm: 3.86
+ [default0]:07/02/2024 20:05:57 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: Memory usage: 7252.45MiB. Peak allocated 25075.44MiB. Peak reserved: 27568.00MiB
+ [default0]:07/02/2024 20:06:06 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-147]: iteration: 20 / 20 | consumed_tokens: 83.9M | elapsed_time_per_iteration_ms: 8.99K | tokens_per_sec: 467K | tokens_per_sec_per_gpu: 29.2K | global_batch_size: 1.02K | lm_loss: 7.03 | lr: 1e-05 | model_tflops_per_gpu: 265 | hardware_tflops_per_gpu: 265 | grad_norm: 2.94
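For reference, the throughput figures in these lines can be re-derived from the logged quantities alone: each iteration consumes global_batch_size × sequence length tokens (1024 × 4096 = 4,194,304, the reported 4.19M), and dividing by the per-iteration time and the 16 GPUs reproduces tokens_per_sec and tokens_per_sec_per_gpu. A quick sanity check using iteration 20:

    # Sanity check of the logged throughput for iteration 20 (inputs are the rounded values from the log).
    global_batch_size = 1024        # logged as global_batch_size: 1.02K
    sequence_length = 4096          # implied by consumed_tokens per iteration: 4.19M / 1024
    elapsed_s = 8.99                # elapsed_time_per_iteration_ms: 8.99K
    n_gpus = 16                     # dp-16_tp-1_pp-1 over 16 GPUs

    tokens_per_iteration = global_batch_size * sequence_length   # 4_194_304, i.e. 4.19M
    tokens_per_sec = tokens_per_iteration / elapsed_s            # ~466K (log: 467K)
    tokens_per_sec_per_gpu = tokens_per_sec / n_gpus             # ~29.2K (log: 29.2K)
    print(round(tokens_per_sec), round(tokens_per_sec_per_gpu))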
  Traceback (most recent call last):
+ File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py", line 4, in <module>
+ from bench_cluster.submit_jobs import submit_jobs, check_status
+ ImportError: cannot import name 'check_status' from 'bench_cluster.submit_jobs' (/fsx/ferdinandmom/ferdinand-hf/bench_cluster/bench_cluster/submit_jobs.py)
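Note: the training loop itself reached 20/20 iterations above; this traceback (and its repeat below) comes from the orchestration script bench_cluster/main.py, which fails at import time because bench_cluster.submit_jobs does not expose a check_status symbol. A hypothetical stub illustrating the missing name (signature and behaviour are assumptions, not the real implementation):

    # Hypothetical stub only: the real check_status is absent from bench_cluster.submit_jobs,
    # which is exactly what the ImportError reports. The name comes from the log; the signature is assumed.
    def check_status(inp_dir: str) -> None:
        """Report the status of previously submitted benchmark jobs under inp_dir (assumed behaviour)."""
        raise NotImplementedError("check_status is not implemented in bench_cluster.submit_jobs")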
  Traceback (most recent call last):
+ File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py", line 4, in <module>
+ from bench_cluster.submit_jobs import submit_jobs, check_status
+ ImportError: cannot import name 'check_status' from 'bench_cluster.submit_jobs' (/fsx/ferdinandmom/ferdinand-hf/bench_cluster/bench_cluster/submit_jobs.py)
  Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
+
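Note on the hf_transfer hint above: hf_transfer is the optional Rust-based transfer backend of huggingface_hub, useful for large files such as the ~4.3 GB trace added below. Enabling it is a standard two-step opt-in (not something configured in this run):

    # Optional speed-up for large uploads/downloads. Requires: pip install hf_transfer
    import os
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"   # must be set before huggingface_hub starts transferring

    from huggingface_hub import HfApi
    api = HfApi()
    # api.upload_file(path_or_fileobj=..., path_in_repo=..., repo_id=...)  # now routed through hf_transfer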
llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-2/profiler/ip-26-0-163-147_704351.1719950608420005259.pt.trace.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca10a77173217c1d6842b282b1881e21564f87670ab7e2ad71d5f3b9a24f517f
+ size 4331528811
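The three lines above are the Git LFS pointer committed in place of the profiler trace; the 4,331,528,811-byte trace itself is stored in LFS. After cloning, `git lfs pull --include="llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-2/profiler/*.pt.trace.json"` fetches it, or it can be downloaded through huggingface_hub as sketched here (the repo_id is a placeholder, it is not stated in this diff):

    # Sketch: fetch the LFS-backed trace via huggingface_hub; repo_id is a placeholder.
    from huggingface_hub import hf_hub_download

    local_path = hf_hub_download(
        repo_id="<namespace>/<repo_name>",   # placeholder, not stated in this diff
        filename="llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-2/profiler/ip-26-0-163-147_704351.1719950608420005259.pt.trace.json",
    )
    print(local_path)   # the resolved ~4.3 GB trace file, not the 3-line pointer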
llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-2/status.txt CHANGED
@@ -1 +1 @@
- fail
+ completed