3outeille HF staff commited on
Commit
ae3d2a6
·
verified ·
1 Parent(s): 78e91c0

Upload llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64

Browse files
llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64/bench.slurm ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ #SBATCH --job-name=bench_cluster
4
+ #SBATCH --time=02:00:00
5
+ #SBATCH --partition=hopper-prod
6
+ #SBATCH --nodes=1
7
+ #SBATCH --gres=gpu:8
8
+ #SBATCH --qos=normal
9
+ #SBATCH --ntasks-per-node=1
10
+ #SBATCH --cpus-per-task=96
11
+ #SBATCH --exclusive
12
+ #SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64/log.out
13
+ #SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64/log.out
14
+
15
+ # Function to update status based on squeue output
16
+ update_status() {
17
+ job_id=$1
18
+ status_file=$2
19
+ # For unknown reasons, it doenst update status for pending. It only works for running
20
+ while true; do
21
+ job_status=$(squeue --job $job_id --noheader --format=%T)
22
+ echo "Job status: $job_status"
23
+ if [ -z "$job_status" ]; then
24
+ # Job has finished or is not found
25
+ break
26
+ elif [ "$job_status" = "RUNNING" ]; then
27
+ printf "running" > $status_file
28
+ break
29
+ fi
30
+ sleep 10
31
+ done
32
+ }
33
+
34
+ # Misc initializations.
35
+ echo "========================"
36
+ echo "START TIME: $(date)"
37
+ source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
38
+ conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster
39
+ echo python3 version = $(python3 --version)
40
+ echo "========================"
41
+
42
+ # Slurm stuff
43
+ export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
44
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
45
+ export MASTER_PORT=$((1024 + RANDOM % 64511))
46
+
47
+ export TMPDIR=/scratch
48
+ export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache"
49
+ export CUBLAS_WORKSPACE_CONFIG=":4096:8"
50
+ export CUDA_DEVICE_MAX_CONNECTIONS="1"
51
+
52
+ huggingface-cli login --token $HUGGINGFACE_TOKEN
53
+
54
+
55
+ NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
56
+ CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64/config.yaml"
57
+
58
+ LAUNCHER="torchrun \
59
+ --nproc_per_node 8 \
60
+ --nnodes 1 \
61
+ --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \
62
+ --rdzv_backend c10d \
63
+ --max_restarts 0 \
64
+ --tee 3 \
65
+ --node_rank ${SLURM_PROCID}"
66
+
67
+ # Checkout the bench_cluster branch
68
+ cd $NANOTRON_REPO
69
+ git checkout bench_cluster
70
+ cd ..
71
+ # Get the current job ID
72
+ job_id=${SLURM_JOB_ID}
73
+
74
+ # Update status to "pending" or "running" in the background
75
+ update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64/status.txt &
76
+
77
+ # Run the main command
78
+ srun -u $LAUNCHER $CMD
79
+ exit_status=$?
80
+
81
+ # Update status based on the exit status of `srun`
82
+ if [ $exit_status -eq 0 ]; then
83
+ printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64/status.txt
84
+ else
85
+ if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64/log.out; then
86
+ printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64/status.txt
87
+ elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64/log.out; then
88
+ printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64/status.txt
89
+ elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64/log.out; then
90
+ printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64/status.txt
91
+ else
92
+ printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64/status.txt
93
+ fi
94
+ fi
95
+
96
+ # Run the report script if the job completed successfully
97
+ if [ $exit_status -eq 0 ]; then
98
+ python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64 --is_logs
99
+ python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64 --is_profiler
100
+ fi
101
+
102
+
103
+ # Push to hub the folder using huggingface_cli
104
+ huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64 llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64 --commit-message "Upload llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64"
105
+
106
+ # Verify the upload
107
+ if [ $? -eq 0 ]; then
108
+ echo "Uploading to Huggingface Hub successful"
109
+ else
110
+ echo "Failed to upload to Huggingface Hub"
111
+ fi
llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64/config.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ general:
2
+ project: bench_cluster
3
+ seed: 42
4
+ model:
5
+ ddp_bucket_cap_mb: 25
6
+ dtype: bfloat16
7
+ init_method:
8
+ std: 0.025
9
+ make_vocab_size_divisible_by: 1
10
+ model_config:
11
+ bos_token_id: 1
12
+ eos_token_id: 2
13
+ hidden_act: silu
14
+ hidden_size: 2048
15
+ initializer_range: 0.02
16
+ intermediate_size: 4096
17
+ is_llama_config: true
18
+ max_position_embeddings: 4096
19
+ num_attention_heads: 32
20
+ num_hidden_layers: 24
21
+ num_key_value_heads: 32
22
+ pad_token_id: null
23
+ pretraining_tp: 1
24
+ rms_norm_eps: 1.0e-05
25
+ rope_scaling: null
26
+ rope_theta: 10000.0
27
+ tie_word_embeddings: true
28
+ use_cache: true
29
+ vocab_size: 50257
30
+ optimizer:
31
+ accumulate_grad_in_fp32: true
32
+ clip_grad: 1.0
33
+ learning_rate_scheduler:
34
+ learning_rate: 0.0001
35
+ lr_decay_style: linear
36
+ lr_warmup_style: linear
37
+ lr_warmup_steps: 1
38
+ min_decay_lr: 1.0e-05
39
+ optimizer_factory:
40
+ adam_beta1: 0.9
41
+ adam_beta2: 0.95
42
+ adam_eps: 1.0e-08
43
+ name: adamW
44
+ torch_adam_is_fused: true
45
+ weight_decay: 0.01
46
+ zero_stage: 1
47
+ parallelism:
48
+ dp: 4
49
+ expert_parallel_size: 1
50
+ pp: 1
51
+ pp_engine: 1f1b
52
+ tp: 2
53
+ tp_linear_async_communication: false
54
+ tp_mode: REDUCE_SCATTER
55
+ profiler:
56
+ profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64
57
+ tokenizer:
58
+ tokenizer_max_length: null
59
+ tokenizer_name_or_path: openai-community/gpt2
60
+ tokenizer_revision: null
61
+ data_stages:
62
+ - name: Training Stage
63
+ start_training_step: 1
64
+ data:
65
+ dataset:
66
+ dataset_overwrite_cache: false
67
+ dataset_processing_num_proc_per_process: 64
68
+ hf_dataset_config_name: null
69
+ hf_dataset_or_datasets: roneneldan/TinyStories
70
+ hf_dataset_splits: train
71
+ text_column_name: text
72
+ num_loading_workers: 0
73
+ seed: 42
74
+ lighteval: null
75
+ tokens:
76
+ train_steps: 20
77
+ val_check_interval: -1
78
+ batch_accumulation_per_replica: 4
79
+ limit_test_batches: 0
80
+ limit_val_batches: 0
81
+ micro_batch_size: 64
82
+ sequence_length: 4096
83
+ logging:
84
+ iteration_step_info_interval: 1
85
+ log_level: info
86
+ log_level_replica: info
87
+ checkpoints:
88
+ checkpoint_interval: 100000
89
+ checkpoints_path: /dev/null
90
+ resume_checkpoint_path: null
llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64/log.out ADDED
@@ -0,0 +1,708 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ========================
2
+ START TIME: Wed Jul 3 23:36:11 UTC 2024
3
+ python3 version = Python 3.10.14
4
+ ========================
5
+ The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
6
+ Token is valid (permission: write).
7
+ Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token
8
+ Login successful
9
+ Already on 'bench_cluster'
10
+ M examples/config_tiny_llama.py
11
+ M examples/config_tiny_llama.yaml
12
+ M examples/train_tiny_llama.sh
13
+ M src/nanotron/models/llama.py
14
+ M src/nanotron/trainer.py
15
+ Your branch is up to date with 'origin/bench_cluster'.
16
+ Job status: RUNNING
17
+ W0703 23:36:13.742000 139644140926784 torch/distributed/run.py:757]
18
+ W0703 23:36:13.742000 139644140926784 torch/distributed/run.py:757] *****************************************
19
+ W0703 23:36:13.742000 139644140926784 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
20
+ W0703 23:36:13.742000 139644140926784 torch/distributed/run.py:757] *****************************************
21
+ [default0]:07/03/2024 23:36:29 [WARNING|DP=0|PP=0|TP=0|ip-26-0-171-88]: [Vocab Size Padding] Padded vocab (size: 50257) with 1 dummy tokens (new size: 50258)
22
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Config:
23
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Config(general=GeneralArgs(project='bench_cluster',
24
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: run='%date_%jobid',
25
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: seed=42,
26
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: step=None,
27
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: consumed_train_samples=None,
28
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: benchmark_csv_path=None,
29
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: ignore_sanity_checks=True),
30
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: parallelism=ParallelismArgs(dp=4,
31
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: pp=1,
32
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: tp=2,
33
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: pp_engine=<nanotron.parallel.pipeline_parallel.engine.OneForwardOneBackwardPipelineEngine object at 0x7f715b9308b0>,
34
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: tp_mode=<TensorParallelLinearMode.REDUCE_SCATTER: 2>,
35
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: tp_linear_async_communication=False,
36
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: expert_parallel_size=1),
37
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1,
38
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: eos_token_id=2,
39
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: hidden_act='silu',
40
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: hidden_size=2048,
41
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: initializer_range=0.02,
42
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: intermediate_size=4096,
43
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: is_llama_config=True,
44
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: max_position_embeddings=4096,
45
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: num_attention_heads=32,
46
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: num_hidden_layers=24,
47
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: num_key_value_heads=32,
48
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: pad_token_id=None,
49
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: pretraining_tp=1,
50
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: rms_norm_eps=1e-05,
51
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: rope_scaling=None,
52
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: rope_theta=10000.0,
53
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: tie_word_embeddings=True,
54
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: use_cache=True,
55
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: vocab_size=50258),
56
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: init_method=RandomInit(std=0.025),
57
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: dtype=torch.bfloat16,
58
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: make_vocab_size_divisible_by=1,
59
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: ddp_bucket_cap_mb=25),
60
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2',
61
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: tokenizer_revision=None,
62
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: tokenizer_max_length=None),
63
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'),
64
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: checkpoint_interval=100000,
65
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: save_initial_state=False,
66
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: resume_checkpoint_path=None,
67
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: checkpoints_path_is_shared_file_system=False),
68
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: logging=LoggingArgs(log_level='info',
69
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: log_level_replica='info',
70
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: iteration_step_info_interval=1),
71
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: tokens=TokensArgs(sequence_length=4096,
72
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: train_steps=20,
73
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: micro_batch_size=64,
74
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: batch_accumulation_per_replica=4,
75
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: val_check_interval=-1,
76
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: limit_val_batches=0,
77
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: limit_test_batches=0),
78
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08,
79
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: adam_beta1=0.9,
80
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: adam_beta2=0.95,
81
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: torch_adam_is_fused=True,
82
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: name='adamW'),
83
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: zero_stage=1,
84
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: weight_decay=0.01,
85
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: clip_grad=1.0,
86
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: accumulate_grad_in_fp32=True,
87
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001,
88
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: lr_warmup_steps=1,
89
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: lr_warmup_style='linear',
90
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: lr_decay_style='linear',
91
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: lr_decay_steps=19,
92
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: lr_decay_starting_step=None,
93
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: min_decay_lr=1e-05)),
94
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: data_stages=[DatasetStageArgs(name='Training Stage',
95
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: start_training_step=1,
96
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories',
97
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: hf_dataset_splits='train',
98
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: hf_dataset_config_name=None,
99
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: dataset_processing_num_proc_per_process=64,
100
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: dataset_overwrite_cache=False,
101
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: text_column_name='text'),
102
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: seed=42,
103
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: num_loading_workers=0))],
104
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64')),
105
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: lighteval=None)
106
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Model Config:
107
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: LlamaConfig(bos_token_id=1,
108
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: eos_token_id=2,
109
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: hidden_act='silu',
110
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: hidden_size=2048,
111
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: initializer_range=0.02,
112
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: intermediate_size=4096,
113
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: is_llama_config=True,
114
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: max_position_embeddings=4096,
115
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: num_attention_heads=32,
116
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: num_hidden_layers=24,
117
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: num_key_value_heads=32,
118
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: pad_token_id=None,
119
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: pretraining_tp=1,
120
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: rms_norm_eps=1e-05,
121
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: rope_scaling=None,
122
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: rope_theta=10000.0,
123
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: tie_word_embeddings=True,
124
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: use_cache=True,
125
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: vocab_size=50258)
126
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Building model..
127
+ [default0]:07/03/2024 23:36:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Setting PP block ranks...
128
+ [default1]:07/03/2024 23:36:40 [INFO|DP=0|PP=0|TP=1|ip-26-0-171-88]: Local number of parameters: 555M (1058.35MiB)
129
+ [default1]:07/03/2024 23:36:40 [INFO|DP=0|PP=0|TP=1|ip-26-0-171-88]: [After model building] Memory usage: 1082.37MiB. Peak allocated: 1182.56MiB Peak reserved: 1200.00MiB
130
+ [default1]:07/03/2024 23:36:40 [INFO|DP=0|PP=0|TP=1|ip-26-0-171-88]: No checkpoint path provided.
131
+ [default0]:07/03/2024 23:36:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Total number of parameters: 1.11G (2116.70MiB)
132
+ [default0]:07/03/2024 23:36:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Local number of parameters: 555M (1058.35MiB)
133
+ [default0]:07/03/2024 23:36:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: [After model building] Memory usage: 1082.37MiB. Peak allocated: 1182.56MiB Peak reserved: 1200.00MiB
134
+ [default0]:07/03/2024 23:36:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: No checkpoint path provided.
135
+ [default0]:07/03/2024 23:36:40 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Parametrizing model parameters using StandardParametrizator
136
+ [default5]:07/03/2024 23:36:40 [INFO|DP=2|PP=0|TP=1|ip-26-0-171-88]: No checkpoint path provided.
137
+ [default4]:07/03/2024 23:36:40 [INFO|DP=2|PP=0|TP=0|ip-26-0-171-88]: No checkpoint path provided.
138
+ [default7]:07/03/2024 23:36:40 [INFO|DP=3|PP=0|TP=1|ip-26-0-171-88]: No checkpoint path provided.
139
+ [default3]:07/03/2024 23:36:40 [INFO|DP=1|PP=0|TP=1|ip-26-0-171-88]: No checkpoint path provided.
140
+ [default2]:07/03/2024 23:36:40 [INFO|DP=1|PP=0|TP=0|ip-26-0-171-88]: No checkpoint path provided.
141
+ [default6]:07/03/2024 23:36:40 [INFO|DP=3|PP=0|TP=0|ip-26-0-171-88]: No checkpoint path provided.
142
+ [default0]:07/03/2024 23:36:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: [Optimizer Building] Using LearningRateForSP as learning rate
143
+ [default0]:07/03/2024 23:36:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: [ZeRO sharding] Size of optimizer params per rank:
144
+ [default0]:07/03/2024 23:36:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: [ZeRO sharding] DP Rank 0 has 139M out of 555M (25.00%) params' optimizer states
145
+ [default0]:07/03/2024 23:36:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: [ZeRO sharding] DP Rank 1 has 139M out of 555M (25.00%) params' optimizer states
146
+ [default0]:07/03/2024 23:36:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: [ZeRO sharding] DP Rank 2 has 139M out of 555M (25.00%) params' optimizer states
147
+ [default0]:07/03/2024 23:36:45 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: [ZeRO sharding] DP Rank 3 has 139M out of 555M (25.00%) params' optimizer states
148
+ [default0]:07/03/2024 23:36:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
149
+ [default0]:07/03/2024 23:36:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Using `datasets` library
150
+ [default0]:07/03/2024 23:36:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
151
+ [default0]:07/03/2024 23:36:46 [WARNING|DP=0|PP=0|TP=0|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
152
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
153
+ [default0]:07/03/2024 23:36:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: [Training Plan] There are 1 training stages
154
+ [default0]:07/03/2024 23:36:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: [Stage Training Stage] start from step 1
155
+ [default0]:07/03/2024 23:36:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]:
156
+ [default0]:07/03/2024 23:36:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: [Start training] datetime: 2024-07-03 23:36:47.224362 | mbs: 64 | grad_accum: 4 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
157
+ [default0]:07/03/2024 23:36:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
158
+ [default0]:07/03/2024 23:36:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-88]: Memory usage: 3729.08MiB. Peak allocated 3729.08MiB. Peak reserved: 3848.00MiB
159
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
160
+ [default3]:07/03/2024 23:36:47 [WARNING|DP=1|PP=0|TP=1|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
161
+ [default2]:07/03/2024 23:36:47 [WARNING|DP=1|PP=0|TP=0|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
162
+ [default4]:07/03/2024 23:36:47 [WARNING|DP=2|PP=0|TP=0|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
163
+ [default6]:07/03/2024 23:36:47 [WARNING|DP=3|PP=0|TP=0|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
164
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
165
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
166
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
167
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
168
+ [default5]:07/03/2024 23:36:47 [WARNING|DP=2|PP=0|TP=1|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
169
+ [default7]:07/03/2024 23:36:47 [WARNING|DP=3|PP=0|TP=1|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
170
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
171
+ [default1]:07/03/2024 23:36:47 [WARNING|DP=0|PP=0|TP=1|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
172
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
173
+ [default4]:[rank4]: Traceback (most recent call last):
174
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
175
+ [default4]:[rank4]: trainer.train(dataloader)
176
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
177
+ [default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
178
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
179
+ [default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter(
180
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
181
+ [default4]:[rank4]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
182
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
183
+ [default4]:[rank4]: output = model(**micro_batch)
184
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
185
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
186
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
187
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
188
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
189
+ [default4]:[rank4]: sharded_logits = self.model(
190
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
191
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
192
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
193
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
194
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
195
+ [default4]:[rank4]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
196
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
197
+ [default4]:[rank4]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
198
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
199
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
200
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
201
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
202
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
203
+ [default4]:[rank4]: output = self.pp_block(**new_kwargs)
204
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
205
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
206
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
207
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
208
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
209
+ [default4]:[rank4]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
210
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
211
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
212
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
213
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
214
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
215
+ [default4]:[rank4]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
216
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
217
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
218
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
219
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
220
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
221
+ [default4]:[rank4]: return self.act(gate_states) * up_states
222
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
223
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
224
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
225
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
226
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/nn/activations.py", line 149, in forward
227
+ [default4]:[rank4]: return nn.functional.silu(input)
228
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/functional.py", line 2102, in silu
229
+ [default4]:[rank4]: return torch._C._nn.silu(input)
230
+ [default4]:[rank4]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 675.94 MiB is free. Including non-PyTorch memory, this process has 78.66 GiB memory in use. Of the allocated memory 65.78 GiB is allocated by PyTorch, and 1.42 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
231
+ [default5]:[rank5]: Traceback (most recent call last):
232
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
233
+ [default5]:[rank5]: trainer.train(dataloader)
234
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
235
+ [default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
236
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
237
+ [default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter(
238
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
239
+ [default5]:[rank5]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
240
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
241
+ [default5]:[rank5]: output = model(**micro_batch)
242
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
243
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
244
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
245
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
246
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
247
+ [default5]:[rank5]: sharded_logits = self.model(
248
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
249
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
250
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
251
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
252
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
253
+ [default5]:[rank5]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
254
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
255
+ [default5]:[rank5]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
256
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
257
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
258
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
259
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
260
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
261
+ [default5]:[rank5]: output = self.pp_block(**new_kwargs)
262
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
263
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
264
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
265
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
266
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
267
+ [default5]:[rank5]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
268
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
269
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
270
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
271
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
272
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
273
+ [default5]:[rank5]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
274
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
275
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
276
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
277
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
278
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
279
+ [default5]:[rank5]: return self.act(gate_states) * up_states
280
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
281
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
282
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
283
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
284
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/nn/activations.py", line 149, in forward
285
+ [default5]:[rank5]: return nn.functional.silu(input)
286
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/functional.py", line 2102, in silu
287
+ [default5]:[rank5]: return torch._C._nn.silu(input)
288
+ [default5]:[rank5]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 675.94 MiB is free. Including non-PyTorch memory, this process has 78.66 GiB memory in use. Of the allocated memory 65.78 GiB is allocated by PyTorch, and 1.42 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
289
+ [default3]:[rank3]: Traceback (most recent call last):
290
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
291
+ [default3]:[rank3]: trainer.train(dataloader)
292
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
293
+ [default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
294
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
295
+ [default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter(
296
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
297
+ [default3]:[rank3]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
298
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
299
+ [default3]:[rank3]: output = model(**micro_batch)
300
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
301
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
302
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
303
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
304
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
305
+ [default3]:[rank3]: sharded_logits = self.model(
306
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
307
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
308
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
309
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
310
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
311
+ [default3]:[rank3]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
312
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
313
+ [default3]:[rank3]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
314
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
315
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
316
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
317
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
318
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
319
+ [default3]:[rank3]: output = self.pp_block(**new_kwargs)
320
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
321
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
322
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
323
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
324
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
325
+ [default3]:[rank3]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
326
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
327
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
328
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
329
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
330
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
331
+ [default3]:[rank3]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
332
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
333
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
334
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
335
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
336
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
337
+ [default3]:[rank3]: return self.act(gate_states) * up_states
338
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
339
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
340
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
341
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
342
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/nn/activations.py", line 149, in forward
343
+ [default3]:[rank3]: return nn.functional.silu(input)
344
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/functional.py", line 2102, in silu
345
+ [default3]:[rank3]: return torch._C._nn.silu(input)
346
+ [default3]:[rank3]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 675.94 MiB is free. Including non-PyTorch memory, this process has 78.66 GiB memory in use. Of the allocated memory 65.78 GiB is allocated by PyTorch, and 1.42 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
347
+ [default6]:[rank6]: Traceback (most recent call last):
348
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
349
+ [default6]:[rank6]: trainer.train(dataloader)
350
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
351
+ [default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
352
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
353
+ [default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter(
354
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
355
+ [default6]:[rank6]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
356
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
357
+ [default6]:[rank6]: output = model(**micro_batch)
358
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
359
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
360
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
361
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
362
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
363
+ [default6]:[rank6]: sharded_logits = self.model(
364
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
365
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
366
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
367
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
368
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
369
+ [default6]:[rank6]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
370
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
371
+ [default6]:[rank6]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
372
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
373
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
374
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
375
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
376
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
377
+ [default6]:[rank6]: output = self.pp_block(**new_kwargs)
378
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
379
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
380
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
381
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
382
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
383
+ [default6]:[rank6]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
384
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
385
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
386
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
387
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
388
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
389
+ [default6]:[rank6]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
390
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
391
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
392
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
393
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
394
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
395
+ [default6]:[rank6]: return self.act(gate_states) * up_states
396
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
397
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
398
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
399
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
400
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/nn/activations.py", line 149, in forward
401
+ [default6]:[rank6]: return nn.functional.silu(input)
402
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/functional.py", line 2102, in silu
403
+ [default6]:[rank6]: return torch._C._nn.silu(input)
404
+ [default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 915.94 MiB is free. Including non-PyTorch memory, this process has 78.42 GiB memory in use. Of the allocated memory 65.78 GiB is allocated by PyTorch, and 1.42 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
405
+ [default2]:[rank2]: Traceback (most recent call last):
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
+ [default2]:[rank2]: trainer.train(dataloader)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
+ [default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
+ [default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter(
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
+ [default2]:[rank2]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
+ [default2]:[rank2]: output = model(**micro_batch)
+ [default7]:[rank7]: Traceback (most recent call last):
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
+ [default7]:[rank7]: trainer.train(dataloader)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
+ [default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
+ [default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter(
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
+ [default2]:[rank2]: sharded_logits = self.model(
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
+ [default7]:[rank7]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default7]:[rank7]: output = model(**micro_batch)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
+ [default7]:[rank7]: sharded_logits = self.model(
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
+ [default2]:[rank2]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
+ [default2]:[rank2]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
+ [default7]:[rank7]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
+ [default2]:[rank2]: output = self.pp_block(**new_kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
+ [default7]:[rank7]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
+ [default7]:[rank7]: output = self.pp_block(**new_kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
+ [default2]:[rank2]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
+ [default7]:[rank7]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
+ [default2]:[rank2]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
+ [default2]:[rank2]: return self.act(gate_states) * up_states
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
+ [default7]:[rank7]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
+ [default7]:[rank7]: return self.act(gate_states) * up_states
+ [default7]:[rank7]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 371.94 MiB is free. Including non-PyTorch memory, this process has 78.96 GiB memory in use. Of the allocated memory 66.78 GiB is allocated by PyTorch, and 1.42 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/nn/activations.py", line 149, in forward
+ [default2]:[rank2]: return nn.functional.silu(input)
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/functional.py", line 2102, in silu
+ [default2]:[rank2]: return torch._C._nn.silu(input)
+ [default2]:[rank2]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 675.94 MiB is free. Including non-PyTorch memory, this process has 78.66 GiB memory in use. Of the allocated memory 65.78 GiB is allocated by PyTorch, and 1.42 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+ [default0]:[rank0]: Traceback (most recent call last):
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
+ [default0]:[rank0]: trainer.train(dataloader)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
+ [default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
+ [default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter(
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
+ [default0]:[rank0]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
+ [default0]:[rank0]: output = model(**micro_batch)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
+ [default0]:[rank0]: sharded_logits = self.model(
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
+ [default0]:[rank0]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
+ [default0]:[rank0]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
+ [default0]:[rank0]: output = self.pp_block(**new_kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
+ [default0]:[rank0]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
+ [default0]:[rank0]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
+ [default0]:[rank0]: return self.act(gate_states) * up_states
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/nn/activations.py", line 149, in forward
+ [default0]:[rank0]: return nn.functional.silu(input)
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/functional.py", line 2102, in silu
+ [default0]:[rank0]: return torch._C._nn.silu(input)
+ [default0]:[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU
+ [default1]:[rank1]: Traceback (most recent call last):
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
+ [default1]:[rank1]: trainer.train(dataloader)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
+ [default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
+ [default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter(
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
+ [default1]:[rank1]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
+ [default1]:[rank1]: output = model(**micro_batch)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
+ [default1]:[rank1]: sharded_logits = self.model(
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
+ [default1]:[rank1]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
+ [default1]:[rank1]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
+ [default1]:[rank1]: output = self.pp_block(**new_kwargs)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
+ [default1]:[rank1]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
+ [default1]:[rank1]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
+ [default1]:[rank1]: return self.act(gate_states) * up_states
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/nn/activations.py", line 149, in forward
+ [default1]:[rank1]: return nn.functional.silu(input)
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/functional.py", line 2102, in silu
+ [default1]:[rank1]: return torch._C._nn.silu(input)
+ [default1]:[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU  has a total capacity of 79.33 GiB of which 723.94 MiB is free. Including non-PyTorch memory, this process has 78.61 GiB memory in use. Of the allocated memory 65.78 GiB is allocated by PyTorch, and 1.42 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+ E0703 23:37:03.996000 139644140926784 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 1104473) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
+ Traceback (most recent call last):
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
+ sys.exit(main())
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
+ return f(*args, **kwargs)
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
+ run(args)
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
+ elastic_launch(
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
+ return launch_agent(self._config, self._entrypoint, list(args))
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
+ raise ChildFailedError(
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+ ============================================================
+ /fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
+ ------------------------------------------------------------
+ Failures:
+ [1]:
+ time : 2024-07-03_23:37:03
+ host : ip-26-0-171-88.ec2.internal
+ rank : 1 (local_rank: 1)
+ exitcode : 1 (pid: 1104474)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ [2]:
+ time : 2024-07-03_23:37:03
+ host : ip-26-0-171-88.ec2.internal
+ rank : 2 (local_rank: 2)
+ exitcode : 1 (pid: 1104475)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ [3]:
+ time : 2024-07-03_23:37:03
+ host : ip-26-0-171-88.ec2.internal
+ rank : 3 (local_rank: 3)
+ exitcode : 1 (pid: 1104476)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ [4]:
+ time : 2024-07-03_23:37:03
+ host : ip-26-0-171-88.ec2.internal
+ rank : 4 (local_rank: 4)
+ exitcode : 1 (pid: 1104477)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ [5]:
+ time : 2024-07-03_23:37:03
+ host : ip-26-0-171-88.ec2.internal
+ rank : 5 (local_rank: 5)
+ exitcode : 1 (pid: 1104478)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ [6]:
+ time : 2024-07-03_23:37:03
+ host : ip-26-0-171-88.ec2.internal
+ rank : 6 (local_rank: 6)
+ exitcode : 1 (pid: 1104479)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ [7]:
+ time : 2024-07-03_23:37:03
+ host : ip-26-0-171-88.ec2.internal
+ rank : 7 (local_rank: 7)
+ exitcode : 1 (pid: 1104480)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ ------------------------------------------------------------
+ Root Cause (first observed failure):
+ [0]:
+ time : 2024-07-03_23:37:03
+ host : ip-26-0-171-88.ec2.internal
+ rank : 0 (local_rank: 0)
+ exitcode : 1 (pid: 1104473)
+ error_file: <N/A>
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+ ============================================================
+ srun: error: ip-26-0-171-88: task 0: Exited with exit code 1
+ Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-64/status.txt ADDED
@@ -0,0 +1 @@
+ oom
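The failing ranks all abort with the same torch.cuda.OutOfMemoryError inside the MLP's SiLU activation, and the allocator message itself points at PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True as a fragmentation mitigation. Below is a minimal, hypothetical bash sketch of that tweak; the export line is not part of the uploaded job script, and with roughly 1 GiB of headroom missing per GPU a smaller micro batch size in config.yaml is likely the more reliable fix.

# Hypothetical mitigation sketch (not part of this upload): export the allocator
# setting recommended by the OOM message before launching torchrun, so every
# local rank inherits it. This lets the caching allocator use expandable segments
# and can reduce fragmentation, but it may not recover the ~1 GiB this run still
# needed; lowering the micro batch size below 64 is the surer remedy.
export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"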