3outeille HF staff committed on
Commit
8898da0
·
verified ·
1 Parent(s): 9de07c2

Upload llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-256

Browse files
llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-256/bench.slurm ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

#SBATCH --job-name=bench_cluster
#SBATCH --time=02:00:00
#SBATCH --partition=hopper-prod
#SBATCH --nodes=1
#SBATCH --gres=gpu:8
#SBATCH --qos=normal
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=96
#SBATCH --exclusive
#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-256/log.out
#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-256/log.out

# Directory holding config.yaml, log.out and status.txt for this run.
RESULTS_DIR="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-256"

# Poll `squeue` until this job is reported RUNNING, then write "running"
# into the status file and stop. Exits the loop when the job is no longer
# listed (finished or unknown).
# Arguments:
#   $1 - SLURM job id to watch
#   $2 - path of the status file to update
update_status() {
    local job_id=$1
    local status_file=$2
    # For unknown reasons, it doesn't update status for pending. It only works for running.
    while true; do
        job_status=$(squeue --job "$job_id" --noheader --format=%T)
        echo "Job status: $job_status"
        if [ -z "$job_status" ]; then
            # Job has finished or is not found.
            break
        elif [ "$job_status" = "RUNNING" ]; then
            printf "running" > "$status_file"
            break
        fi
        sleep 10
    done
}

# Misc initializations.
echo "========================"
echo "START TIME: $(date)"
source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster
echo "python3 version = $(python3 --version)"
echo "========================"

# Slurm stuff
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=$((1024 + RANDOM % 64511))

export TMPDIR=/scratch
export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache"
export CUBLAS_WORKSPACE_CONFIG=":4096:8"
export CUDA_DEVICE_MAX_CONNECTIONS="1"

huggingface-cli login --token "$HUGGINGFACE_TOKEN"

NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
CMD="$NANOTRON_REPO/run_train.py --config-file $RESULTS_DIR/config.yaml"

LAUNCHER="torchrun \
    --nproc_per_node 8 \
    --nnodes 1 \
    --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \
    --rdzv_backend c10d \
    --max_restarts 0 \
    --tee 3 \
    --node_rank ${SLURM_PROCID}"

# Checkout the bench_cluster branch. Abort if the repo path is wrong so we
# never run `git checkout` in an unintended directory.
cd "$NANOTRON_REPO" || exit 1
git checkout bench_cluster
cd ..

# Get the current job ID
job_id=${SLURM_JOB_ID}

# Update status to "pending" or "running" in the background
update_status "$job_id" "$RESULTS_DIR/status.txt" &

# Run the main command. $LAUNCHER and $CMD are intentionally unquoted so the
# shell word-splits them into the torchrun executable and its arguments.
# shellcheck disable=SC2086
srun -u $LAUNCHER $CMD
exit_status=$?

# Update status based on the exit status of `srun`
if [ $exit_status -eq 0 ]; then
    printf "completed" > "$RESULTS_DIR/status.txt"
else
    if grep -q "OutOfMemoryError" "$RESULTS_DIR/log.out"; then
        printf "oom" > "$RESULTS_DIR/status.txt"
    elif grep -q " CUDA error: an illegal memory access" "$RESULTS_DIR/log.out"; then
        # Illegal memory access is classified as an OOM-type failure on purpose.
        printf "oom" > "$RESULTS_DIR/status.txt"
    elif grep -q "Timeout at NCCL" "$RESULTS_DIR/log.out"; then
        printf "timeout" > "$RESULTS_DIR/status.txt"
    else
        printf "fail" > "$RESULTS_DIR/status.txt"
    fi
fi

# Run the report script if the job completed successfully
if [ $exit_status -eq 0 ]; then
    python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir "$RESULTS_DIR" --is_logs
    python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir "$RESULTS_DIR" --is_profiler
fi


# Push to hub the folder using huggingface_cli
huggingface-cli upload nanotron/bench_cluster "$RESULTS_DIR" llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-256 --commit-message "Upload llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-256"

# Verify the upload
if [ $? -eq 0 ]; then
    echo "Uploading to Huggingface Hub successful"
else
    echo "Failed to upload to Huggingface Hub"
fi
llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-256/config.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ general:
2
+ project: bench_cluster
3
+ seed: 42
4
+ model:
5
+ ddp_bucket_cap_mb: 25
6
+ dtype: bfloat16
7
+ init_method:
8
+ std: 0.025
9
+ make_vocab_size_divisible_by: 1
10
+ model_config:
11
+ bos_token_id: 1
12
+ eos_token_id: 2
13
+ hidden_act: silu
14
+ hidden_size: 2048
15
+ initializer_range: 0.02
16
+ intermediate_size: 4096
17
+ is_llama_config: true
18
+ max_position_embeddings: 4096
19
+ num_attention_heads: 32
20
+ num_hidden_layers: 24
21
+ num_key_value_heads: 32
22
+ pad_token_id: null
23
+ pretraining_tp: 1
24
+ rms_norm_eps: 1.0e-05
25
+ rope_scaling: null
26
+ rope_theta: 10000.0
27
+ tie_word_embeddings: true
28
+ use_cache: true
29
+ vocab_size: 50257
30
+ optimizer:
31
+ accumulate_grad_in_fp32: true
32
+ clip_grad: 1.0
33
+ learning_rate_scheduler:
34
+ learning_rate: 0.0001
35
+ lr_decay_style: linear
36
+ lr_warmup_style: linear
37
+ lr_warmup_steps: 1
38
+ min_decay_lr: 1.0e-05
39
+ optimizer_factory:
40
+ adam_beta1: 0.9
41
+ adam_beta2: 0.95
42
+ adam_eps: 1.0e-08
43
+ name: adamW
44
+ torch_adam_is_fused: true
45
+ weight_decay: 0.01
46
+ zero_stage: 1
47
+ parallelism:
48
+ dp: 1
49
+ expert_parallel_size: 1
50
+ pp: 1
51
+ pp_engine: 1f1b
52
+ tp: 8
53
+ tp_linear_async_communication: false
54
+ tp_mode: REDUCE_SCATTER
55
+ profiler:
56
+ profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-256
57
+ tokenizer:
58
+ tokenizer_max_length: null
59
+ tokenizer_name_or_path: openai-community/gpt2
60
+ tokenizer_revision: null
61
+ data_stages:
62
+ - name: Training Stage
63
+ start_training_step: 1
64
+ data:
65
+ dataset:
66
+ dataset_overwrite_cache: false
67
+ dataset_processing_num_proc_per_process: 64
68
+ hf_dataset_config_name: null
69
+ hf_dataset_or_datasets: roneneldan/TinyStories
70
+ hf_dataset_splits: train
71
+ text_column_name: text
72
+ num_loading_workers: 0
73
+ seed: 42
74
+ lighteval: null
75
+ tokens:
76
+ train_steps: 20
77
+ val_check_interval: -1
78
+ batch_accumulation_per_replica: 4
79
+ limit_test_batches: 0
80
+ limit_val_batches: 0
81
+ micro_batch_size: 256
82
+ sequence_length: 4096
83
+ logging:
84
+ iteration_step_info_interval: 1
85
+ log_level: info
86
+ log_level_replica: info
87
+ checkpoints:
88
+ checkpoint_interval: 100000
89
+ checkpoints_path: /dev/null
90
+ resume_checkpoint_path: null
llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-256/log.out ADDED
@@ -0,0 +1,675 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ========================
2
+ START TIME: Thu Jul 4 02:27:55 UTC 2024
3
+ python3 version = Python 3.10.14
4
+ ========================
5
+ The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
6
+ Token is valid (permission: write).
7
+ Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token
8
+ Login successful
9
+ Already on 'bench_cluster'
10
+ M examples/config_tiny_llama.py
11
+ M examples/config_tiny_llama.yaml
12
+ M examples/train_tiny_llama.sh
13
+ M src/nanotron/models/llama.py
14
+ M src/nanotron/trainer.py
15
+ Your branch is up to date with 'origin/bench_cluster'.
16
+ Job status: RUNNING
17
+ W0704 02:27:58.305000 140315521349440 torch/distributed/run.py:757]
18
+ W0704 02:27:58.305000 140315521349440 torch/distributed/run.py:757] *****************************************
19
+ W0704 02:27:58.305000 140315521349440 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
20
+ W0704 02:27:58.305000 140315521349440 torch/distributed/run.py:757] *****************************************
21
+ [default0]:07/04/2024 02:28:14 [WARNING|DP=0|PP=0|TP=0|ip-26-0-171-88]: [Vocab Size Padding] Padded vocab (size: 50257) with 7 dummy tokens (new size: 50264)
22
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Config:
23
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Config(general=GeneralArgs(project='bench_cluster',
24
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: run='%date_%jobid',
25
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: seed=42,
26
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: step=None,
27
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: consumed_train_samples=None,
28
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: benchmark_csv_path=None,
29
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: ignore_sanity_checks=True),
30
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: parallelism=ParallelismArgs(dp=1,
31
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: pp=1,
32
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: tp=8,
33
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: pp_engine=<nanotron.parallel.pipeline_parallel.engine.OneForwardOneBackwardPipelineEngine object at 0x7f04e6fd4700>,
34
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: tp_mode=<TensorParallelLinearMode.REDUCE_SCATTER: 2>,
35
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: tp_linear_async_communication=False,
36
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: expert_parallel_size=1),
37
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1,
38
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: eos_token_id=2,
39
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: hidden_act='silu',
40
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: hidden_size=2048,
41
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: initializer_range=0.02,
42
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: intermediate_size=4096,
43
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: is_llama_config=True,
44
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: max_position_embeddings=4096,
45
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: num_attention_heads=32,
46
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: num_hidden_layers=24,
47
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: num_key_value_heads=32,
48
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: pad_token_id=None,
49
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: pretraining_tp=1,
50
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: rms_norm_eps=1e-05,
51
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: rope_scaling=None,
52
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: rope_theta=10000.0,
53
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: tie_word_embeddings=True,
54
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: use_cache=True,
55
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: vocab_size=50264),
56
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: init_method=RandomInit(std=0.025),
57
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: dtype=torch.bfloat16,
58
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: make_vocab_size_divisible_by=1,
59
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: ddp_bucket_cap_mb=25),
60
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2',
61
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: tokenizer_revision=None,
62
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: tokenizer_max_length=None),
63
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'),
64
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: checkpoint_interval=100000,
65
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: save_initial_state=False,
66
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: resume_checkpoint_path=None,
67
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: checkpoints_path_is_shared_file_system=False),
68
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: logging=LoggingArgs(log_level='info',
69
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: log_level_replica='info',
70
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: iteration_step_info_interval=1),
71
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: tokens=TokensArgs(sequence_length=4096,
72
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: train_steps=20,
73
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: micro_batch_size=256,
74
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: batch_accumulation_per_replica=4,
75
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: val_check_interval=-1,
76
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: limit_val_batches=0,
77
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: limit_test_batches=0),
78
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08,
79
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: adam_beta1=0.9,
80
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: adam_beta2=0.95,
81
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: torch_adam_is_fused=True,
82
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: name='adamW'),
83
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: zero_stage=1,
84
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: weight_decay=0.01,
85
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: clip_grad=1.0,
86
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: accumulate_grad_in_fp32=True,
87
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001,
88
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: lr_warmup_steps=1,
89
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: lr_warmup_style='linear',
90
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: lr_decay_style='linear',
91
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: lr_decay_steps=19,
92
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: lr_decay_starting_step=None,
93
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: min_decay_lr=1e-05)),
94
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: data_stages=[DatasetStageArgs(name='Training Stage',
95
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: start_training_step=1,
96
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories',
97
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: hf_dataset_splits='train',
98
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: hf_dataset_config_name=None,
99
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: dataset_processing_num_proc_per_process=64,
100
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: dataset_overwrite_cache=False,
101
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: text_column_name='text'),
102
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: seed=42,
103
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: num_loading_workers=0))],
104
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-256')),
105
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: lighteval=None)
106
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Model Config:
107
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: LlamaConfig(bos_token_id=1,
108
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: eos_token_id=2,
109
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: hidden_act='silu',
110
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: hidden_size=2048,
111
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: initializer_range=0.02,
112
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: intermediate_size=4096,
113
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: is_llama_config=True,
114
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: max_position_embeddings=4096,
115
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: num_attention_heads=32,
116
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: num_hidden_layers=24,
117
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: num_key_value_heads=32,
118
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: pad_token_id=None,
119
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: pretraining_tp=1,
120
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: rms_norm_eps=1e-05,
121
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: rope_scaling=None,
122
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: rope_theta=10000.0,
123
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: tie_word_embeddings=True,
124
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: use_cache=True,
125
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: vocab_size=50264)
126
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Building model..
127
+ [default0]:07/04/2024 02:28:14 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Setting PP block ranks...
128
+ [default4]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=4|ip-26-0-171-88]: Local number of parameters: 139M (264.73MiB)
129
+ [default5]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=5|ip-26-0-171-88]: Local number of parameters: 139M (264.73MiB)
130
+ [default5]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=5|ip-26-0-171-88]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
131
+ [default5]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=5|ip-26-0-171-88]: No checkpoint path provided.
132
+ [default4]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=4|ip-26-0-171-88]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
133
+ [default4]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=4|ip-26-0-171-88]: No checkpoint path provided.
134
+ [default0]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Total number of parameters: 1.11G (2117.88MiB)
135
+ [default0]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Local number of parameters: 139M (264.73MiB)
136
+ [default0]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
137
+ [default0]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: No checkpoint path provided.
138
+ [default0]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Parametrizing model parameters using StandardParametrizator
139
+ [default0]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: [Optimizer Building] Using LearningRateForSP as learning rate
140
+ [default0]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: [ZeRO sharding] Size of optimizer params per rank:
141
+ [default0]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: [ZeRO sharding] DP Rank 0 has 139M out of 139M (100.00%) params' optimizer states
142
+ [default2]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=2|ip-26-0-171-88]: Local number of parameters: 139M (264.73MiB)
143
+ [default2]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=2|ip-26-0-171-88]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
144
+ [default2]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=2|ip-26-0-171-88]: No checkpoint path provided.
145
+ [default7]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=7|ip-26-0-171-88]: Local number of parameters: 139M (264.73MiB)
146
+ [default7]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=7|ip-26-0-171-88]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
147
+ [default7]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=7|ip-26-0-171-88]: No checkpoint path provided.
148
+ [default6]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=6|ip-26-0-171-88]: Local number of parameters: 139M (264.73MiB)
149
+ [default6]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=6|ip-26-0-171-88]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
150
+ [default3]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=3|ip-26-0-171-88]: Local number of parameters: 139M (264.73MiB)
151
+ [default3]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=3|ip-26-0-171-88]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
152
+ [default6]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=6|ip-26-0-171-88]: No checkpoint path provided.
153
+ [default3]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=3|ip-26-0-171-88]: No checkpoint path provided.
154
+ [default1]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=1|ip-26-0-171-88]: Local number of parameters: 139M (264.73MiB)
155
+ [default1]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=1|ip-26-0-171-88]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
156
+ [default1]:07/04/2024 02:28:30 [INFO|DP=0|PP=0|TP=1|ip-26-0-171-88]: No checkpoint path provided.
157
+ [default0]:07/04/2024 02:28:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
158
+ [default0]:07/04/2024 02:28:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Using `datasets` library
159
+ [default0]:07/04/2024 02:28:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
160
+ [default0]:07/04/2024 02:28:31 [WARNING|DP=0|PP=0|TP=0|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
161
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
162
+ [default0]:07/04/2024 02:28:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: [Training Plan] There are 1 training stages
163
+ [default0]:07/04/2024 02:28:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: [Stage Training Stage] start from step 1
164
+ [default0]:07/04/2024 02:28:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]:
165
+ [default0]:07/04/2024 02:28:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: [Start training] datetime: 2024-07-04 02:28:32.629932 | mbs: 256 | grad_accum: 4 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
166
+ [default0]:07/04/2024 02:28:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
167
+ [default0]:07/04/2024 02:28:32 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Memory usage: 1350.75MiB. Peak allocated 1350.76MiB. Peak reserved: 1384.00MiB
168
+ [default4]:07/04/2024 02:28:32 [WARNING|DP=0|PP=0|TP=4|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
169
+ [default5]:07/04/2024 02:28:32 [WARNING|DP=0|PP=0|TP=5|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
170
+ [default6]:07/04/2024 02:28:32 [WARNING|DP=0|PP=0|TP=6|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
171
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
172
+ [default1]:07/04/2024 02:28:32 [WARNING|DP=0|PP=0|TP=1|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
173
+ [default2]:07/04/2024 02:28:32 [WARNING|DP=0|PP=0|TP=2|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
174
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
175
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
176
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
177
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
178
+ [default7]:07/04/2024 02:28:32 [WARNING|DP=0|PP=0|TP=7|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
179
+ [default3]:07/04/2024 02:28:32 [WARNING|DP=0|PP=0|TP=3|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
180
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
181
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
182
+ [default2]:[rank2]: Traceback (most recent call last):
183
+ [default0]:[rank0]: Traceback (most recent call last):
184
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
185
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
186
+ [default2]:[rank2]: trainer.train(dataloader)
187
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
188
+ [default0]:[rank0]: trainer.train(dataloader)
189
+ [default6]:[rank6]: Traceback (most recent call last):
190
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
191
+ [default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
192
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
193
+ [default6]:[rank6]: trainer.train(dataloader)
194
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
195
+ [default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
196
+ [default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter(
197
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
198
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
199
+ [default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter(
200
+ [default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
201
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
202
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
203
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
204
+ [default0]:[rank0]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
205
+ [default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter(
206
+ [default2]:[rank2]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
207
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
208
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
209
+ [default6]:[rank6]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
210
+ [default0]:[rank0]: output = model(**micro_batch)
211
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
212
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
213
+ [default6]:[rank6]: output = model(**micro_batch)
214
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
215
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
216
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
217
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
218
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
219
+ [default2]:[rank2]: output = model(**micro_batch)
220
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
221
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
222
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
223
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
224
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
225
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
226
+ [default0]:[rank0]: sharded_logits = self.model(
227
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
228
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
229
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
230
+ [default6]:[rank6]: sharded_logits = self.model(
231
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
232
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
233
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
234
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
235
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
236
+ [default2]:[rank2]: sharded_logits = self.model(
237
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
238
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
239
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
240
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
241
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
242
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
243
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
244
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
245
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
246
+ [default6]:[rank6]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
247
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
248
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
249
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
250
+ [default0]:[rank0]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
251
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
252
+ [default2]:[rank2]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
253
+ [default0]:[rank0]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
254
+ [default6]:[rank6]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
255
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
256
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
257
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
258
+ [default2]:[rank2]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
259
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
260
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
261
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
262
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
263
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
264
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
265
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
266
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
267
+ [default0]:[rank0]: output = self.pp_block(**new_kwargs)
268
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
269
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
270
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
271
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
272
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
273
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
274
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
275
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
276
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
277
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
278
+ [default6]:[rank6]: output = self.pp_block(**new_kwargs)
279
+ [default0]:[rank0]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
280
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
281
+ [default2]:[rank2]: output = self.pp_block(**new_kwargs)
282
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
283
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
284
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
285
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
286
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
287
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
288
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
289
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
290
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
291
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
292
+ [default6]:[rank6]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
293
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
294
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
295
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
296
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
297
+ [default7]:[rank7]: Traceback (most recent call last):
298
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
299
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
300
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
301
+ [default2]:[rank2]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
302
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
303
+ [default0]:[rank0]: merged_states = self.gate_up_proj(hidden_states)
304
+ [default7]:[rank7]: trainer.train(dataloader)
305
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
306
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
307
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
308
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
309
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
310
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
311
+ [default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
312
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
313
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
314
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
315
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
316
+ [default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter(
317
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
318
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
319
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
320
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
321
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
322
+ [default7]:[rank7]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
323
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
324
+ [default0]:[rank0]: return column_linear(
325
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
326
+ [default6]:[rank6]: merged_states = self.gate_up_proj(hidden_states)
327
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
328
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
329
+ [default2]:[rank2]: merged_states = self.gate_up_proj(hidden_states)
330
+ [default0]:[rank0]: return F.linear(input, weight, bias)
331
+ [default0]:[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU
332
+ [default7]:[rank7]: output = model(**micro_batch)
333
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
334
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
335
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
336
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
337
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
338
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
339
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
340
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
341
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
342
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
343
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
344
+ [default7]:[rank7]: sharded_logits = self.model(
345
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
346
+ [default6]:[rank6]: return column_linear(
347
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
348
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
349
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
350
+ [default2]:[rank2]: return column_linear(
351
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
352
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
353
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
354
+ [default2]:[rank2]: return F.linear(input, weight, bias)
355
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
356
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
357
+ [default6]:[rank6]: return F.linear(input, weight, bias)
358
+ [default2]:[rank2]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.71 GiB is free. Including non-PyTorch memory, this process has 77.61 GiB memory in use. Of the allocated memory 64.45 GiB is allocated by PyTorch, and 1.43 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
359
+ [default7]:[rank7]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
360
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
361
+ [default7]:[rank7]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
362
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
363
+ [default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.71 GiB is free. Including non-PyTorch memory, this process has 77.61 GiB memory in use. Of the allocated memory 64.45 GiB is allocated by PyTorch, and 1.43 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
364
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
365
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
366
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
367
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
368
+ [default7]:[rank7]: output = self.pp_block(**new_kwargs)
369
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
370
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
371
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
372
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
373
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
374
+ [default7]:[rank7]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
375
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
376
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
377
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
378
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
379
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
380
+ [default7]:[rank7]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
381
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
382
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
383
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
384
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
385
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
386
+ [default7]:[rank7]: return self.act(gate_states) * up_states
387
+ [default7]:[rank7]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 421.94 MiB is free. Including non-PyTorch memory, this process has 78.91 GiB memory in use. Of the allocated memory 67.45 GiB is allocated by PyTorch, and 436.23 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
388
+ [default1]:[rank1]: Traceback (most recent call last):
389
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
390
+ [default1]:[rank1]: trainer.train(dataloader)
391
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
392
+ [default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
393
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
394
+ [default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter(
395
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
396
+ [default1]:[rank1]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
397
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
398
+ [default1]:[rank1]: output = model(**micro_batch)
399
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
400
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
401
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
402
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
403
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
404
+ [default1]:[rank1]: sharded_logits = self.model(
405
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
406
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
407
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
408
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
409
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
410
+ [default1]:[rank1]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
411
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
412
+ [default1]:[rank1]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
413
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
414
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
415
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
416
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
417
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
418
+ [default1]:[rank1]: output = self.pp_block(**new_kwargs)
419
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
420
+ [default3]:[rank3]: Traceback (most recent call last):
421
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
422
+ [default3]:[rank3]: trainer.train(dataloader)
423
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
424
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
425
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
426
+ [default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
427
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
428
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
429
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
430
+ [default1]:[rank1]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
431
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
432
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
433
+ [default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter(
434
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
435
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
436
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
437
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
438
+ [default3]:[rank3]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
439
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
440
+ [default1]:[rank1]: merged_states = self.gate_up_proj(hidden_states)
441
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
442
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
443
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
444
+ [default3]:[rank3]: output = model(**micro_batch)
445
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
446
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
447
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
448
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
449
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
450
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
451
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
452
+ [default1]:[rank1]: return column_linear(
453
+ [default3]:[rank3]: sharded_logits = self.model(
454
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
455
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
456
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
457
+ [default1]:[rank1]: return F.linear(input, weight, bias)
458
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
459
+ [default1]:[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.71 GiB is free. Including non-PyTorch memory, this process has 77.61 GiB memory in use. Of the allocated memory 64.45 GiB is allocated by PyTorch, and 1.43 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
460
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
461
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
462
+ [default3]:[rank3]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
463
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
464
+ [default3]:[rank3]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
465
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
466
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
467
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
468
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
469
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
470
+ [default3]:[rank3]: output = self.pp_block(**new_kwargs)
471
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
472
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
473
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
474
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
475
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
476
+ [default3]:[rank3]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
477
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
478
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
479
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
480
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
481
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
482
+ [default3]:[rank3]: merged_states = self.gate_up_proj(hidden_states)
483
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
484
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
485
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
486
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
487
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
488
+ [default3]:[rank3]: return column_linear(
489
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
490
+ [default3]:[rank3]: return F.linear(input, weight, bias)
491
+ [default3]:[rank3]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.71 GiB is free. Including non-PyTorch memory, this process has 77.61 GiB memory in use. Of the allocated memory 64.45 GiB is allocated by PyTorch, and 1.43 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
492
+ [default5]:[rank5]: Traceback (most recent call last):
493
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
494
+ [default5]:[rank5]: trainer.train(dataloader)
495
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
496
+ [default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
497
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
498
+ [default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter(
499
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
500
+ [default5]:[rank5]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
501
+ [default4]:[rank4]: Traceback (most recent call last):
502
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
503
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
504
+ [default5]:[rank5]: output = model(**micro_batch)
505
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
506
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
507
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
508
+ [default4]:[rank4]: trainer.train(dataloader)
509
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
510
+ [default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
511
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
512
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
513
+ [default5]:[rank5]: sharded_logits = self.model(
514
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
515
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
516
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
517
+ [default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter(
518
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
519
+ [default4]:[rank4]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
520
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
521
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
522
+ [default4]:[rank4]: output = model(**micro_batch)
523
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
524
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
525
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
526
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
527
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
528
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
529
+ [default4]:[rank4]: sharded_logits = self.model(
530
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
531
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
532
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
533
+ [default5]:[rank5]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
534
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
535
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
536
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
537
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
538
+ [default5]:[rank5]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
539
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
540
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
541
+ [default4]:[rank4]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
542
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
543
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
544
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
545
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
546
+ [default5]:[rank5]: output = self.pp_block(**new_kwargs)
547
+ [default4]:[rank4]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
548
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
549
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
550
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
551
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
552
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
553
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
554
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
555
+ [default4]:[rank4]: output = self.pp_block(**new_kwargs)
556
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
557
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
558
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
559
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
560
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
561
+ [default4]:[rank4]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
562
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
563
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
564
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
565
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
566
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
567
+ [default4]:[rank4]: merged_states = self.gate_up_proj(hidden_states)
568
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
569
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
570
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
571
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
572
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
573
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
574
+ [default4]:[rank4]: return column_linear(
575
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
576
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
577
+ [default4]:[rank4]: return F.linear(input, weight, bias)
578
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
579
+ [default5]:[rank5]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
580
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
581
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
582
+ [default4]:[rank4]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.71 GiB is free. Including non-PyTorch memory, this process has 77.61 GiB memory in use. Of the allocated memory 64.45 GiB is allocated by PyTorch, and 1.43 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
583
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
584
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
585
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
586
+ [default5]:[rank5]: merged_states = self.gate_up_proj(hidden_states)
587
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
588
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
589
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
590
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
591
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
592
+ [default5]:[rank5]: return column_linear(
593
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
594
+ [default5]:[rank5]: return F.linear(input, weight, bias)
595
+ [default5]:[rank5]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 1.71 GiB is free. Including non-PyTorch memory, this process has 77.61 GiB memory in use. Of the allocated memory 64.45 GiB is allocated by PyTorch, and 1.43 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
596
+ E0704 02:28:53.525000 140315521349440 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1144719) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
597
+ Traceback (most recent call last):
598
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
599
+ sys.exit(main())
600
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
601
+ return f(*args, **kwargs)
602
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
603
+ run(args)
604
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
605
+ elastic_launch(
606
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
607
+ return launch_agent(self._config, self._entrypoint, list(args))
608
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
609
+ raise ChildFailedError(
610
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
611
+ ============================================================
612
+ /fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
613
+ ------------------------------------------------------------
614
+ Failures:
615
+ [1]:
616
+ time : 2024-07-04_02:28:53
617
+ host : ip-26-0-171-88.ec2.internal
618
+ rank : 1 (local_rank: 1)
619
+ exitcode : 1 (pid: 1144720)
620
+ error_file: <N/A>
621
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
622
+ [2]:
623
+ time : 2024-07-04_02:28:53
624
+ host : ip-26-0-171-88.ec2.internal
625
+ rank : 2 (local_rank: 2)
626
+ exitcode : 1 (pid: 1144721)
627
+ error_file: <N/A>
628
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
629
+ [3]:
630
+ time : 2024-07-04_02:28:53
631
+ host : ip-26-0-171-88.ec2.internal
632
+ rank : 3 (local_rank: 3)
633
+ exitcode : 1 (pid: 1144722)
634
+ error_file: <N/A>
635
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
636
+ [4]:
637
+ time : 2024-07-04_02:28:53
638
+ host : ip-26-0-171-88.ec2.internal
639
+ rank : 4 (local_rank: 4)
640
+ exitcode : 1 (pid: 1144723)
641
+ error_file: <N/A>
642
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
643
+ [5]:
644
+ time : 2024-07-04_02:28:53
645
+ host : ip-26-0-171-88.ec2.internal
646
+ rank : 5 (local_rank: 5)
647
+ exitcode : 1 (pid: 1144724)
648
+ error_file: <N/A>
649
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
650
+ [6]:
651
+ time : 2024-07-04_02:28:53
652
+ host : ip-26-0-171-88.ec2.internal
653
+ rank : 6 (local_rank: 6)
654
+ exitcode : 1 (pid: 1144725)
655
+ error_file: <N/A>
656
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
657
+ [7]:
658
+ time : 2024-07-04_02:28:53
659
+ host : ip-26-0-171-88.ec2.internal
660
+ rank : 7 (local_rank: 7)
661
+ exitcode : 1 (pid: 1144726)
662
+ error_file: <N/A>
663
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
664
+ ------------------------------------------------------------
665
+ Root Cause (first observed failure):
666
+ [0]:
667
+ time : 2024-07-04_02:28:53
668
+ host : ip-26-0-171-88.ec2.internal
669
+ rank : 0 (local_rank: 0)
670
+ exitcode : 1 (pid: 1144719)
671
+ error_file: <N/A>
672
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
673
+ ============================================================
674
+ srun: error: ip-26-0-171-88: task 0: Exited with exit code 1
675
+ Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-256/status.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ oom