3outeille committed (verified)
Commit 5e2919f · Parent(s): 3ff6bc1

Upload llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256

llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256/bench.slurm ADDED
@@ -0,0 +1,111 @@
+ #!/bin/bash
+
+ #SBATCH --job-name=bench_cluster
+ #SBATCH --time=02:00:00
+ #SBATCH --partition=hopper-prod
+ #SBATCH --nodes=1
+ #SBATCH --gres=gpu:8
+ #SBATCH --qos=normal
+ #SBATCH --ntasks-per-node=1
+ #SBATCH --cpus-per-task=96
+ #SBATCH --exclusive
+ #SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256/log.out
+ #SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256/log.out
+
+ # Function to update status based on squeue output
+ update_status() {
+     job_id=$1
+     status_file=$2
+     # For unknown reasons, this doesn't update the status for pending jobs; it only works for running ones
+     while true; do
+         job_status=$(squeue --job $job_id --noheader --format=%T)
+         echo "Job status: $job_status"
+         if [ -z "$job_status" ]; then
+             # Job has finished or is not found
+             break
+         elif [ "$job_status" = "RUNNING" ]; then
+             printf "running" > $status_file
+             break
+         fi
+         sleep 10
+     done
+ }
+
+ # Misc initializations.
+ echo "========================"
+ echo "START TIME: $(date)"
+ source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
+ conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster
+ echo "python3 version = $(python3 --version)"
+ echo "========================"
+
+ # Slurm stuff
+ export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+ export MASTER_PORT=$((1024 + RANDOM % 64511))
+
+ export TMPDIR=/scratch
+ export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache"
+ export CUBLAS_WORKSPACE_CONFIG=":4096:8"
+ export CUDA_DEVICE_MAX_CONNECTIONS="1"
+
+ huggingface-cli login --token $HUGGINGFACE_TOKEN
+
+
+ NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
+ CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256/config.yaml"
+
+ LAUNCHER="torchrun \
+     --nproc_per_node 8 \
+     --nnodes 1 \
+     --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \
+     --rdzv_backend c10d \
+     --max_restarts 0 \
+     --tee 3 \
+     --node_rank ${SLURM_PROCID}"
+
+ # Checkout the bench_cluster branch
+ cd $NANOTRON_REPO
+ git checkout bench_cluster
+ cd ..
+ # Get the current job ID
+ job_id=${SLURM_JOB_ID}
+
+ # Update status to "pending" or "running" in the background
+ update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256/status.txt &
+
+ # Run the main command
+ srun -u $LAUNCHER $CMD
+ exit_status=$?
+
+ # Update status based on the exit status of `srun`
+ if [ $exit_status -eq 0 ]; then
+     printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256/status.txt
+ else
+     if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256/log.out; then
+         printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256/status.txt
+     elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256/log.out; then
+         printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256/status.txt
+     elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256/log.out; then
+         printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256/status.txt
+     else
+         printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256/status.txt
+     fi
+ fi
+
+ # Run the report script if the job completed successfully
+ if [ $exit_status -eq 0 ]; then
+     python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256 --is_logs
+     python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256 --is_profiler
+ fi
+
+
+ # Push the folder to the Hub using huggingface-cli
+ huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256 llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256 --commit-message "Upload llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256"
+
+ # Verify the upload
+ if [ $? -eq 0 ]; then
+     echo "Upload to Hugging Face Hub successful"
+ else
+     echo "Failed to upload to Hugging Face Hub"
+ fi
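
The script's lifecycle tracking reduces to a one-word status file: a background watcher polls squeue until the job reports RUNNING, and once srun returns, the exit code plus a few grep patterns over log.out pick one of completed / oom / timeout / fail (note the script deliberately files illegal-memory-access errors under "oom" as well). A minimal Python sketch of the same logic, with function names and the poll interval chosen here for illustration, not taken from the benchmark code:

    import subprocess
    import time
    from pathlib import Path

    def wait_until_running(job_id: str, status_file: Path, poll_s: int = 10) -> None:
        # Poll squeue until the job is RUNNING (record it) or gone (give up).
        while True:
            state = subprocess.run(
                ["squeue", "--job", job_id, "--noheader", "--format=%T"],
                capture_output=True, text=True,
            ).stdout.strip()
            if not state:          # job finished or unknown: nothing to record
                return
            if state == "RUNNING":
                status_file.write_text("running")
                return
            time.sleep(poll_s)

    def classify_exit(exit_status: int, log_text: str) -> str:
        # Mirror the grep cascade in bench.slurm above.
        if exit_status == 0:
            return "completed"
        if "OutOfMemoryError" in log_text or "CUDA error: an illegal memory access" in log_text:
            return "oom"           # illegal memory access is binned as OOM too
        if "Timeout at NCCL" in log_text:
            return "timeout"
        return "fail"
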
llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256/config.yaml ADDED
@@ -0,0 +1,90 @@
+ general:
+   project: bench_cluster
+   seed: 42
+ model:
+   ddp_bucket_cap_mb: 25
+   dtype: bfloat16
+   init_method:
+     std: 0.025
+   make_vocab_size_divisible_by: 1
+   model_config:
+     bos_token_id: 1
+     eos_token_id: 2
+     hidden_act: silu
+     hidden_size: 2048
+     initializer_range: 0.02
+     intermediate_size: 4096
+     is_llama_config: true
+     max_position_embeddings: 4096
+     num_attention_heads: 32
+     num_hidden_layers: 24
+     num_key_value_heads: 32
+     pad_token_id: null
+     pretraining_tp: 1
+     rms_norm_eps: 1.0e-05
+     rope_scaling: null
+     rope_theta: 10000.0
+     tie_word_embeddings: true
+     use_cache: true
+     vocab_size: 50257
+ optimizer:
+   accumulate_grad_in_fp32: true
+   clip_grad: 1.0
+   learning_rate_scheduler:
+     learning_rate: 0.0001
+     lr_decay_style: linear
+     lr_warmup_style: linear
+     lr_warmup_steps: 1
+     min_decay_lr: 1.0e-05
+   optimizer_factory:
+     adam_beta1: 0.9
+     adam_beta2: 0.95
+     adam_eps: 1.0e-08
+     name: adamW
+     torch_adam_is_fused: true
+   weight_decay: 0.01
+   zero_stage: 1
+ parallelism:
+   dp: 2
+   expert_parallel_size: 1
+   pp: 1
+   pp_engine: 1f1b
+   tp: 4
+   tp_linear_async_communication: false
+   tp_mode: REDUCE_SCATTER
+ profiler:
+   profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256
+ tokenizer:
+   tokenizer_max_length: null
+   tokenizer_name_or_path: openai-community/gpt2
+   tokenizer_revision: null
+ data_stages:
+ - name: Training Stage
+   start_training_step: 1
+   data:
+     dataset:
+       dataset_overwrite_cache: false
+       dataset_processing_num_proc_per_process: 64
+       hf_dataset_config_name: null
+       hf_dataset_or_datasets: roneneldan/TinyStories
+       hf_dataset_splits: train
+       text_column_name: text
+     num_loading_workers: 0
+     seed: 42
+ lighteval: null
+ tokens:
+   train_steps: 20
+   val_check_interval: -1
+   batch_accumulation_per_replica: 2
+   limit_test_batches: 0
+   limit_val_batches: 0
+   micro_batch_size: 256
+   sequence_length: 4096
+ logging:
+   iteration_step_info_interval: 1
+   log_level: info
+   log_level_replica: info
+ checkpoints:
+   checkpoint_interval: 100000
+   checkpoints_path: /dev/null
+   resume_checkpoint_path: null
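
Together, the parallelism and tokens sections pin down the effective batch: dp=2 replicas, each accumulating 2 micro-batches of 256 sequences, give 1024 sequences of 4096 tokens per optimizer step (the log below reports the same global_batch_size). A quick sanity check in Python, variable names chosen here for readability rather than taken from nanotron:

    dp = 2                     # parallelism.dp
    micro_batch_size = 256     # tokens.micro_batch_size
    grad_accum = 2             # tokens.batch_accumulation_per_replica
    sequence_length = 4096     # tokens.sequence_length

    global_batch_size = dp * micro_batch_size * grad_accum   # 1024 sequences/step
    tokens_per_step = global_batch_size * sequence_length    # 4,194,304 tokens/step
    print(global_batch_size, tokens_per_step)
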
llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256/log.out ADDED
@@ -0,0 +1,406 @@
+ ========================
+ START TIME: Wed Jul 3 22:52:12 UTC 2024
+ python3 version = Python 3.10.14
+ ========================
+ The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
+ Token is valid (permission: write).
+ Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token
+ Login successful
+ Already on 'bench_cluster'
+ M examples/config_tiny_llama.py
+ M examples/config_tiny_llama.yaml
+ M examples/train_tiny_llama.sh
+ M src/nanotron/models/llama.py
+ M src/nanotron/trainer.py
+ Your branch is up to date with 'origin/bench_cluster'.
+ Job status: RUNNING
+ W0703 22:52:15.049000 139960710256448 torch/distributed/run.py:757]
+ W0703 22:52:15.049000 139960710256448 torch/distributed/run.py:757] *****************************************
+ W0703 22:52:15.049000 139960710256448 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+ W0703 22:52:15.049000 139960710256448 torch/distributed/run.py:757] *****************************************
+ [default0]:07/03/2024 22:52:31 [WARNING|DP=0|PP=0|TP=0|ip-26-0-161-178]: [Vocab Size Padding] Padded vocab (size: 50257) with 3 dummy tokens (new size: 50260)
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Config:
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Config(general=GeneralArgs(project='bench_cluster',
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: run='%date_%jobid',
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: seed=42,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: step=None,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: consumed_train_samples=None,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: benchmark_csv_path=None,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: ignore_sanity_checks=True),
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: parallelism=ParallelismArgs(dp=2,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: pp=1,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tp=4,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: pp_engine=<nanotron.parallel.pipeline_parallel.engine.OneForwardOneBackwardPipelineEngine object at 0x7faf893a8820>,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tp_mode=<TensorParallelLinearMode.REDUCE_SCATTER: 2>,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tp_linear_async_communication=False,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: expert_parallel_size=1),
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: eos_token_id=2,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: hidden_act='silu',
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: hidden_size=2048,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: initializer_range=0.02,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: intermediate_size=4096,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: is_llama_config=True,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: max_position_embeddings=4096,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_attention_heads=32,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_hidden_layers=24,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_key_value_heads=32,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: pad_token_id=None,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: pretraining_tp=1,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: rms_norm_eps=1e-05,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: rope_scaling=None,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: rope_theta=10000.0,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tie_word_embeddings=True,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: use_cache=True,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: vocab_size=50260),
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: init_method=RandomInit(std=0.025),
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: dtype=torch.bfloat16,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: make_vocab_size_divisible_by=1,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: ddp_bucket_cap_mb=25),
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2',
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tokenizer_revision=None,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tokenizer_max_length=None),
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'),
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: checkpoint_interval=100000,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: save_initial_state=False,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: resume_checkpoint_path=None,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: checkpoints_path_is_shared_file_system=False),
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: logging=LoggingArgs(log_level='info',
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: log_level_replica='info',
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: iteration_step_info_interval=1),
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tokens=TokensArgs(sequence_length=4096,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: train_steps=20,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: micro_batch_size=256,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: batch_accumulation_per_replica=2,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: val_check_interval=-1,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: limit_val_batches=0,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: limit_test_batches=0),
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: adam_beta1=0.9,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: adam_beta2=0.95,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: torch_adam_is_fused=True,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: name='adamW'),
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: zero_stage=1,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: weight_decay=0.01,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: clip_grad=1.0,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: accumulate_grad_in_fp32=True,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: lr_warmup_steps=1,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: lr_warmup_style='linear',
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: lr_decay_style='linear',
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: lr_decay_steps=19,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: lr_decay_starting_step=None,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: min_decay_lr=1e-05)),
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: data_stages=[DatasetStageArgs(name='Training Stage',
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: start_training_step=1,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories',
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: hf_dataset_splits='train',
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: hf_dataset_config_name=None,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: dataset_processing_num_proc_per_process=64,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: dataset_overwrite_cache=False,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: text_column_name='text'),
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: seed=42,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_loading_workers=0))],
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256')),
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: lighteval=None)
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Model Config:
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: LlamaConfig(bos_token_id=1,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: eos_token_id=2,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: hidden_act='silu',
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: hidden_size=2048,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: initializer_range=0.02,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: intermediate_size=4096,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: is_llama_config=True,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: max_position_embeddings=4096,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_attention_heads=32,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_hidden_layers=24,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: num_key_value_heads=32,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: pad_token_id=None,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: pretraining_tp=1,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: rms_norm_eps=1e-05,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: rope_scaling=None,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: rope_theta=10000.0,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: tie_word_embeddings=True,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: use_cache=True,
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: vocab_size=50260)
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Building model..
+ [default0]:07/03/2024 22:52:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Setting PP block ranks...
+ [default4]:07/03/2024 22:52:43 [INFO|DP=1|PP=0|TP=0|ip-26-0-161-178]: No checkpoint path provided.
+ [default0]:07/03/2024 22:52:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Total number of parameters: 1.11G (2117.09MiB)
+ [default0]:07/03/2024 22:52:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Local number of parameters: 277M (529.27MiB)
+ [default0]:07/03/2024 22:52:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [After model building] Memory usage: 554.21MiB. Peak allocated: 606.24MiB Peak reserved: 608.00MiB
+ [default0]:07/03/2024 22:52:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: No checkpoint path provided.
+ [default0]:07/03/2024 22:52:43 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Parametrizing model parameters using StandardParametrizator
+ [default1]:07/03/2024 22:52:43 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-178]: Local number of parameters: 277M (529.27MiB)
+ [default1]:07/03/2024 22:52:43 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-178]: [After model building] Memory usage: 554.21MiB. Peak allocated: 606.24MiB Peak reserved: 608.00MiB
+ [default1]:07/03/2024 22:52:43 [INFO|DP=0|PP=0|TP=1|ip-26-0-161-178]: No checkpoint path provided.
+ [default7]:07/03/2024 22:52:43 [INFO|DP=1|PP=0|TP=3|ip-26-0-161-178]: No checkpoint path provided.
+ [default5]:07/03/2024 22:52:43 [INFO|DP=1|PP=0|TP=1|ip-26-0-161-178]: No checkpoint path provided.
+ [default6]:07/03/2024 22:52:43 [INFO|DP=1|PP=0|TP=2|ip-26-0-161-178]: No checkpoint path provided.
+ [default3]:07/03/2024 22:52:43 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-178]: Local number of parameters: 277M (529.27MiB)
+ [default3]:07/03/2024 22:52:43 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-178]: [After model building] Memory usage: 554.21MiB. Peak allocated: 606.24MiB Peak reserved: 608.00MiB
+ [default3]:07/03/2024 22:52:43 [INFO|DP=0|PP=0|TP=3|ip-26-0-161-178]: No checkpoint path provided.
+ [default2]:07/03/2024 22:52:43 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-178]: Local number of parameters: 277M (529.27MiB)
+ [default2]:07/03/2024 22:52:43 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-178]: [After model building] Memory usage: 554.21MiB. Peak allocated: 606.24MiB Peak reserved: 608.00MiB
+ [default2]:07/03/2024 22:52:43 [INFO|DP=0|PP=0|TP=2|ip-26-0-161-178]: No checkpoint path provided.
+ [default0]:07/03/2024 22:52:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [Optimizer Building] Using LearningRateForSP as learning rate
+ [default0]:07/03/2024 22:52:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [ZeRO sharding] Size of optimizer params per rank:
+ [default0]:07/03/2024 22:52:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [ZeRO sharding] DP Rank 0 has 139M out of 277M (50.00%) params' optimizer states
+ [default0]:07/03/2024 22:52:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [ZeRO sharding] DP Rank 1 has 139M out of 277M (50.00%) params' optimizer states
+ [default0]:07/03/2024 22:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
+ [default0]:07/03/2024 22:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Using `datasets` library
+ [default0]:07/03/2024 22:52:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
+ [default0]:07/03/2024 22:52:47 [WARNING|DP=0|PP=0|TP=0|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
+ [default0]:07/03/2024 22:52:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [Training Plan] There are 1 training stages
+ [default0]:07/03/2024 22:52:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [Stage Training Stage] start from step 1
+ [default0]:07/03/2024 22:52:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]:
+ [default0]:07/03/2024 22:52:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: [Start training] datetime: 2024-07-03 22:52:48.694265 | mbs: 256 | grad_accum: 2 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
+ [default0]:07/03/2024 22:52:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
+ [default0]:07/03/2024 22:52:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-161-178]: Memory usage: 2142.76MiB. Peak allocated 2142.76MiB. Peak reserved: 2198.00MiB
+ [default7]:07/03/2024 22:52:48 [WARNING|DP=1|PP=0|TP=3|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
+ [default3]:07/03/2024 22:52:48 [WARNING|DP=0|PP=0|TP=3|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
+ [default2]:07/03/2024 22:52:48 [WARNING|DP=0|PP=0|TP=2|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
+ [default4]:07/03/2024 22:52:48 [WARNING|DP=1|PP=0|TP=0|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
+ [default5]:07/03/2024 22:52:48 [WARNING|DP=1|PP=0|TP=1|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
+ [default1]:07/03/2024 22:52:48 [WARNING|DP=0|PP=0|TP=1|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
+ [default6]:07/03/2024 22:52:48 [WARNING|DP=1|PP=0|TP=2|ip-26-0-161-178]: Repo card metadata block was not found. Setting CardData to empty.
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
+ [default6]:[rank6]: Traceback (most recent call last):
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
+ [default6]:[rank6]: trainer.train(dataloader)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
+ [default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
+ [default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter(
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
+ [default6]:[rank6]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
+ [default6]:[rank6]: output = model(**micro_batch)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
+ [default6]:[rank6]: sharded_logits = self.model(
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
+ [default6]:[rank6]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
+ [default6]:[rank6]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
+ [default6]:[rank6]: output = self.pp_block(**new_kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
+ [default6]:[rank6]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 563, in forward
+ [default6]:[rank6]: key_value_states = torch.cat([key_states.unsqueeze(0), value_states.unsqueeze(0)], dim=0)
+ [default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.71 GiB is free. Including non-PyTorch memory, this process has 77.60 GiB memory in use. Of the allocated memory 64.22 GiB is allocated by PyTorch, and 1.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+ [default4]:[rank4]: Traceback (most recent call last):
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
+ [default4]:[rank4]: trainer.train(dataloader)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
+ [default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
+ [default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter(
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
+ [default4]:[rank4]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
+ [default4]:[rank4]: output = model(**micro_batch)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
+ [default4]:[rank4]: sharded_logits = self.model(
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
+ [default4]:[rank4]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
+ [default4]:[rank4]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
+ [default4]:[rank4]: output = self.pp_block(**new_kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
+ [default4]:[rank4]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 563, in forward
+ [default4]:[rank4]: key_value_states = torch.cat([key_states.unsqueeze(0), value_states.unsqueeze(0)], dim=0)
+ [default4]:[rank4]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.76 GiB is free. Including non-PyTorch memory, this process has 77.56 GiB memory in use. Of the allocated memory 64.22 GiB is allocated by PyTorch, and 1.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+ [default5]:[rank5]: Traceback (most recent call last):
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
+ [default5]:[rank5]: trainer.train(dataloader)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
+ [default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
+ [default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter(
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
+ [default5]:[rank5]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
+ [default5]:[rank5]: output = model(**micro_batch)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
+ [default5]:[rank5]: sharded_logits = self.model(
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
+ [default5]:[rank5]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
+ [default5]:[rank5]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
+ [default5]:[rank5]: output = self.pp_block(**new_kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
+ [default5]:[rank5]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 563, in forward
+ [default5]:[rank5]: key_value_states = torch.cat([key_states.unsqueeze(0), value_states.unsqueeze(0)], dim=0)
+ [default5]:[rank5]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.71 GiB is free. Including non-PyTorch memory, this process has 77.60 GiB memory in use. Of the allocated memory 64.22 GiB is allocated by PyTorch, and 1.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+ [default7]:[rank7]: Traceback (most recent call last):
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
+ [default7]:[rank7]: trainer.train(dataloader)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
+ [default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
+ [default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter(
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
+ [default7]:[rank7]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
+ [default7]:[rank7]: output = model(**micro_batch)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
+ [default7]:[rank7]: sharded_logits = self.model(
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
+ [default7]:[rank7]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
+ [default7]:[rank7]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
+ [default7]:[rank7]: output = self.pp_block(**new_kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
+ [default7]:[rank7]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 565, in forward
+ [default7]:[rank7]: key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous()
+ [default7]:[rank7]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 427.94 MiB is free. Including non-PyTorch memory, this process has 78.90 GiB memory in use. Of the allocated memory 66.22 GiB is allocated by PyTorch, and 1.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+ W0703 22:53:00.240000 139960710256448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1035397 closing signal SIGTERM
+ W0703 22:53:00.240000 139960710256448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1035398 closing signal SIGTERM
+ W0703 22:53:00.240000 139960710256448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1035399 closing signal SIGTERM
+ W0703 22:53:00.242000 139960710256448 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1035400 closing signal SIGTERM
+ E0703 22:53:01.554000 139960710256448 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 4 (pid: 1035401) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
+ Traceback (most recent call last):
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
+ sys.exit(main())
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
+ return f(*args, **kwargs)
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
+ run(args)
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
+ elastic_launch(
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
+ return launch_agent(self._config, self._entrypoint, list(args))
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
+ raise ChildFailedError(
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+ ============================================================
+ /fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
+ ------------------------------------------------------------
+ Failures:
+ [1]:
+ time : 2024-07-03_22:53:00
+ host : ip-26-0-161-178.ec2.internal
+ rank : 5 (local_rank: 5)
+ exitcode : 1 (pid: 1035402)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ [2]:
+ time : 2024-07-03_22:53:00
+ host : ip-26-0-161-178.ec2.internal
+ rank : 6 (local_rank: 6)
+ exitcode : 1 (pid: 1035403)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ [3]:
+ time : 2024-07-03_22:53:00
+ host : ip-26-0-161-178.ec2.internal
+ rank : 7 (local_rank: 7)
+ exitcode : 1 (pid: 1035404)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ ------------------------------------------------------------
+ Root Cause (first observed failure):
+ [0]:
+ time : 2024-07-03_22:53:00
+ host : ip-26-0-161-178.ec2.internal
+ rank : 4 (local_rank: 4)
+ exitcode : 1 (pid: 1035401)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ ============================================================
+ srun: error: ip-26-0-161-178: task 0: Exited with exit code 1
+ Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
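
The failed 2.00 GiB allocation is consistent with this run's shapes. At llama.py:563 the attention layer stacks the key and value states into one tensor; assuming it holds 2 x micro_batch x seq x (heads / tp) x head_dim bfloat16 elements (the exact layout is an assumption here; only the element count matters), the size comes out to exactly what the allocator reports:

    batch, seq = 256, 4096          # micro_batch_size, sequence_length
    hidden, heads, tp = 2048, 32, 4
    head_dim = hidden // heads      # 64
    local_heads = heads // tp       # 8 attention heads per TP rank
    bf16_bytes = 2

    kv_bytes = 2 * batch * seq * local_heads * head_dim * bf16_bytes  # k and v stacked
    print(kv_bytes / 2**30)         # 2.0 GiB, matching "Tried to allocate 2.00 GiB"

With under 2 GiB free after activations for a 256-sequence micro-batch, the request cannot be satisfied, hence the "oom" status recorded below.
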
llama-1B/8_GPUS/dp-2_tp-4_pp-1_mbz-256/status.txt ADDED
@@ -0,0 +1 @@
+ oom