Commit 6eaccad (verified) · 3outeille (HF staff) · Parent(s): 0d3553d

Upload llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512
llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512/bench.slurm ADDED
@@ -0,0 +1,111 @@
+ #!/bin/bash
+
+ #SBATCH --job-name=bench_cluster
+ #SBATCH --time=02:00:00
+ #SBATCH --partition=hopper-prod
+ #SBATCH --nodes=1
+ #SBATCH --gres=gpu:8
+ #SBATCH --qos=normal
+ #SBATCH --ntasks-per-node=1
+ #SBATCH --cpus-per-task=96
+ #SBATCH --exclusive
+ #SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512/log.out
+ #SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512/log.out
+
+ # Function to update status based on squeue output
+ update_status() {
+     job_id=$1
+     status_file=$2
+     # For unknown reasons, it doesn't update the status for pending jobs; it only works for running ones.
+     while true; do
+         job_status=$(squeue --job $job_id --noheader --format=%T)
+         echo "Job status: $job_status"
+         if [ -z "$job_status" ]; then
+             # Job has finished or is not found
+             break
+         elif [ "$job_status" = "RUNNING" ]; then
+             printf "running" > $status_file
+             break
+         fi
+         sleep 10
+     done
+ }
+
+ # Misc initializations.
+ echo "========================"
+ echo "START TIME: $(date)"
+ source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
+ conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster
+ echo python3 version = $(python3 --version)
+ echo "========================"
+
+ # Slurm stuff
+ export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+ export MASTER_PORT=$((1024 + RANDOM % 64511))
+
+ export TMPDIR=/scratch
+ export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache"
+ export CUBLAS_WORKSPACE_CONFIG=":4096:8"
+ export CUDA_DEVICE_MAX_CONNECTIONS="1"
+
+ huggingface-cli login --token $HUGGINGFACE_TOKEN
+
+
+ NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
+ CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512/config.yaml"
+
+ LAUNCHER="torchrun \
+     --nproc_per_node 8 \
+     --nnodes 1 \
+     --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \
+     --rdzv_backend c10d \
+     --max_restarts 0 \
+     --tee 3 \
+     --node_rank ${SLURM_PROCID}"
+
+ # Check out the bench_cluster branch
+ cd $NANOTRON_REPO
+ git checkout bench_cluster
+ cd ..
+ # Get the current job ID
+ job_id=${SLURM_JOB_ID}
+
+ # Update status to "pending" or "running" in the background
+ update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512/status.txt &
+
+ # Run the main command
+ srun -u $LAUNCHER $CMD
+ exit_status=$?
+
+ # Update status based on the exit status of `srun`
+ if [ $exit_status -eq 0 ]; then
+     printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512/status.txt
+ else
+     if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512/log.out; then
+         printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512/status.txt
+     elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512/log.out; then
+         printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512/status.txt
+     elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512/log.out; then
+         printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512/status.txt
+     else
+         printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512/status.txt
+     fi
+ fi
+
+ # Run the report script if the job completed successfully
+ if [ $exit_status -eq 0 ]; then
+     python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512 --is_logs
+     python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512 --is_profiler
+ fi
+
+
+ # Push the folder to the Hub using huggingface-cli
+ huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512 llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512 --commit-message "Upload llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512"
+
+ # Verify the upload
+ if [ $? -eq 0 ]; then
+     echo "Uploading to Huggingface Hub successful"
+ else
+     echo "Failed to upload to Huggingface Hub"
+ fi
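
The status classification above runs only inside the SLURM job itself. When reprocessing a finished run, the same grep rules can be applied to an existing log without resubmitting anything. The sketch below is a hypothetical standalone helper (its name and arguments are not part of this repository) and assumes the job already exited non-zero, since a zero exit is what bench.slurm records as "completed".

#!/bin/bash
# Hypothetical helper: re-derive status.txt from an already-written log.out,
# using the same grep patterns as bench.slurm, without resubmitting the job.
# Usage: classify_log.sh /path/to/log.out /path/to/status.txt
log_file=$1
status_file=$2

# Assumes a non-zero exit; a zero exit would already have been written as "completed".
if grep -q "OutOfMemoryError" "$log_file"; then
    printf "oom" > "$status_file"
elif grep -q " CUDA error: an illegal memory access" "$log_file"; then
    printf "oom" > "$status_file"
elif grep -q "Timeout at NCCL" "$log_file"; then
    printf "timeout" > "$status_file"
else
    printf "fail" > "$status_file"
fi
echo "status: $(cat "$status_file")"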
llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512/config.yaml ADDED
@@ -0,0 +1,90 @@
+ general:
+   project: bench_cluster
+   seed: 42
+ model:
+   ddp_bucket_cap_mb: 25
+   dtype: bfloat16
+   init_method:
+     std: 0.025
+   make_vocab_size_divisible_by: 1
+   model_config:
+     bos_token_id: 1
+     eos_token_id: 2
+     hidden_act: silu
+     hidden_size: 2048
+     initializer_range: 0.02
+     intermediate_size: 4096
+     is_llama_config: true
+     max_position_embeddings: 4096
+     num_attention_heads: 32
+     num_hidden_layers: 24
+     num_key_value_heads: 32
+     pad_token_id: null
+     pretraining_tp: 1
+     rms_norm_eps: 1.0e-05
+     rope_scaling: null
+     rope_theta: 10000.0
+     tie_word_embeddings: true
+     use_cache: true
+     vocab_size: 50257
+ optimizer:
+   accumulate_grad_in_fp32: true
+   clip_grad: 1.0
+   learning_rate_scheduler:
+     learning_rate: 0.0001
+     lr_decay_style: linear
+     lr_warmup_style: linear
+     lr_warmup_steps: 1
+     min_decay_lr: 1.0e-05
+   optimizer_factory:
+     adam_beta1: 0.9
+     adam_beta2: 0.95
+     adam_eps: 1.0e-08
+     name: adamW
+     torch_adam_is_fused: true
+   weight_decay: 0.01
+   zero_stage: 1
+ parallelism:
+   dp: 1
+   expert_parallel_size: 1
+   pp: 1
+   pp_engine: 1f1b
+   tp: 8
+   tp_linear_async_communication: false
+   tp_mode: REDUCE_SCATTER
+ profiler:
+   profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512
+ tokenizer:
+   tokenizer_max_length: null
+   tokenizer_name_or_path: openai-community/gpt2
+   tokenizer_revision: null
+ data_stages:
+ - name: Training Stage
+   start_training_step: 1
+   data:
+     dataset:
+       dataset_overwrite_cache: false
+       dataset_processing_num_proc_per_process: 64
+       hf_dataset_config_name: null
+       hf_dataset_or_datasets: roneneldan/TinyStories
+       hf_dataset_splits: train
+       text_column_name: text
+     num_loading_workers: 0
+     seed: 42
+ lighteval: null
+ tokens:
+   train_steps: 20
+   val_check_interval: -1
+   batch_accumulation_per_replica: 2
+   limit_test_batches: 0
+   limit_val_batches: 0
+   micro_batch_size: 512
+   sequence_length: 4096
+ logging:
+   iteration_step_info_interval: 1
+   log_level: info
+   log_level_replica: info
+ checkpoints:
+   checkpoint_interval: 100000
+   checkpoints_path: /dev/null
+   resume_checkpoint_path: null
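
For reference, a few sizes implied by this config can be worked out from the numbers above. The sketch below is an estimate, assuming the usual Llama layout used by nanotron (four attention projections, a fused gate+up plus a down projection in the MLP, RMSNorm without biases, tied embeddings) and the padded vocabulary of 50264 reported in the log below; it is not nanotron's exact accounting, but it lands on the same ~1.11G total and ~139M per tensor-parallel rank that the log prints.

#!/bin/bash
# Back-of-the-envelope sizes implied by config.yaml (values mirror the config;
# the padded vocab 50264 comes from the log, not from the config itself).
dp=1; tp=8; mbs=512; grad_accum=2; seq=4096
hidden=2048; inter=4096; layers=24; vocab=50264

gbs=$(( dp * mbs * grad_accum ))            # samples per optimizer step
tokens_per_step=$(( gbs * seq ))            # tokens consumed per step

# Rough parameter count: attention (4 square projections) + gated MLP (gate, up, down)
# + 2 RMSNorms per layer, plus tied embedding and final norm.
per_layer=$(( 4*hidden*hidden + 3*hidden*inter + 2*hidden ))
params=$(( vocab*hidden + layers*per_layer + hidden ))

echo "global_batch_size      = $gbs samples/step"
echo "tokens_per_step        = $tokens_per_step"
echo "approx_total_params    = $params        # log reports ~1.11G"
echo "approx_params_per_rank = $(( params / tp ))   # log reports ~139M per TP rank"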
llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512/log.out ADDED
@@ -0,0 +1,671 @@
1
+ ========================
2
+ START TIME: Thu Jul 4 00:05:41 UTC 2024
3
+ python3 version = Python 3.10.14
4
+ ========================
5
+ The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
6
+ Token is valid (permission: write).
7
+ Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token
8
+ Login successful
9
+ fatal: Unable to create '/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/.git/index.lock': File exists.
10
+
11
+ Another git process seems to be running in this repository, e.g.
12
+ an editor opened by 'git commit'. Please make sure all processes
13
+ are terminated then try again. If it still fails, a git process
14
+ may have crashed in this repository earlier:
15
+ remove the file manually to continue.
16
+ Job status: RUNNING
17
+ W0704 00:05:44.480000 140468343650112 torch/distributed/run.py:757]
18
+ W0704 00:05:44.480000 140468343650112 torch/distributed/run.py:757] *****************************************
19
+ W0704 00:05:44.480000 140468343650112 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
20
+ W0704 00:05:44.480000 140468343650112 torch/distributed/run.py:757] *****************************************
21
+ [default0]:07/04/2024 00:06:03 [WARNING|DP=0|PP=0|TP=0|ip-26-0-164-187]: [Vocab Size Padding] Padded vocab (size: 50257) with 7 dummy tokens (new size: 50264)
22
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Config:
23
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Config(general=GeneralArgs(project='bench_cluster',
24
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: run='%date_%jobid',
25
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: seed=42,
26
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: step=None,
27
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: consumed_train_samples=None,
28
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: benchmark_csv_path=None,
29
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: ignore_sanity_checks=True),
30
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: parallelism=ParallelismArgs(dp=1,
31
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: pp=1,
32
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tp=8,
33
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: pp_engine=<nanotron.parallel.pipeline_parallel.engine.OneForwardOneBackwardPipelineEngine object at 0x7f672bd00880>,
34
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tp_mode=<TensorParallelLinearMode.REDUCE_SCATTER: 2>,
35
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tp_linear_async_communication=False,
36
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: expert_parallel_size=1),
37
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1,
38
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: eos_token_id=2,
39
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: hidden_act='silu',
40
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: hidden_size=2048,
41
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: initializer_range=0.02,
42
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: intermediate_size=4096,
43
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: is_llama_config=True,
44
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: max_position_embeddings=4096,
45
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_attention_heads=32,
46
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_hidden_layers=24,
47
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_key_value_heads=32,
48
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: pad_token_id=None,
49
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: pretraining_tp=1,
50
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: rms_norm_eps=1e-05,
51
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: rope_scaling=None,
52
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: rope_theta=10000.0,
53
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tie_word_embeddings=True,
54
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: use_cache=True,
55
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: vocab_size=50264),
56
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: init_method=RandomInit(std=0.025),
57
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: dtype=torch.bfloat16,
58
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: make_vocab_size_divisible_by=1,
59
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: ddp_bucket_cap_mb=25),
60
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2',
61
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tokenizer_revision=None,
62
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tokenizer_max_length=None),
63
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'),
64
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: checkpoint_interval=100000,
65
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: save_initial_state=False,
66
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: resume_checkpoint_path=None,
67
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: checkpoints_path_is_shared_file_system=False),
68
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: logging=LoggingArgs(log_level='info',
69
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: log_level_replica='info',
70
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: iteration_step_info_interval=1),
71
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tokens=TokensArgs(sequence_length=4096,
72
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: train_steps=20,
73
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: micro_batch_size=512,
74
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: batch_accumulation_per_replica=2,
75
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: val_check_interval=-1,
76
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: limit_val_batches=0,
77
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: limit_test_batches=0),
78
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08,
79
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: adam_beta1=0.9,
80
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: adam_beta2=0.95,
81
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: torch_adam_is_fused=True,
82
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: name='adamW'),
83
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: zero_stage=1,
84
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: weight_decay=0.01,
85
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: clip_grad=1.0,
86
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: accumulate_grad_in_fp32=True,
87
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001,
88
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: lr_warmup_steps=1,
89
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: lr_warmup_style='linear',
90
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: lr_decay_style='linear',
91
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: lr_decay_steps=19,
92
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: lr_decay_starting_step=None,
93
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: min_decay_lr=1e-05)),
94
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: data_stages=[DatasetStageArgs(name='Training Stage',
95
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: start_training_step=1,
96
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories',
97
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: hf_dataset_splits='train',
98
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: hf_dataset_config_name=None,
99
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: dataset_processing_num_proc_per_process=64,
100
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: dataset_overwrite_cache=False,
101
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: text_column_name='text'),
102
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: seed=42,
103
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_loading_workers=0))],
104
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512')),
105
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: lighteval=None)
106
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Model Config:
107
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: LlamaConfig(bos_token_id=1,
108
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: eos_token_id=2,
109
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: hidden_act='silu',
110
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: hidden_size=2048,
111
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: initializer_range=0.02,
112
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: intermediate_size=4096,
113
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: is_llama_config=True,
114
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: max_position_embeddings=4096,
115
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_attention_heads=32,
116
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_hidden_layers=24,
117
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: num_key_value_heads=32,
118
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: pad_token_id=None,
119
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: pretraining_tp=1,
120
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: rms_norm_eps=1e-05,
121
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: rope_scaling=None,
122
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: rope_theta=10000.0,
123
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: tie_word_embeddings=True,
124
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: use_cache=True,
125
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: vocab_size=50264)
126
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Building model..
127
+ [default0]:07/04/2024 00:06:03 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Setting PP block ranks...
128
+ [default6]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=6|ip-26-0-164-187]: Local number of parameters: 139M (264.73MiB)
129
+ [default6]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=6|ip-26-0-164-187]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
130
+ [default6]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=6|ip-26-0-164-187]: No checkpoint path provided.
131
+ [default5]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=5|ip-26-0-164-187]: Local number of parameters: 139M (264.73MiB)
132
+ [default5]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=5|ip-26-0-164-187]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
133
+ [default5]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=5|ip-26-0-164-187]: No checkpoint path provided.
134
+ [default0]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Total number of parameters: 1.11G (2117.88MiB)
135
+ [default0]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Local number of parameters: 139M (264.73MiB)
136
+ [default0]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
137
+ [default0]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: No checkpoint path provided.
138
+ [default0]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Parametrizing model parameters using StandardParametrizator
139
+ [default0]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [Optimizer Building] Using LearningRateForSP as learning rate
140
+ [default0]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [ZeRO sharding] Size of optimizer params per rank:
141
+ [default0]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [ZeRO sharding] DP Rank 0 has 139M out of 139M (100.00%) params' optimizer states
142
+ [default1]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=1|ip-26-0-164-187]: Local number of parameters: 139M (264.73MiB)
143
+ [default1]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=1|ip-26-0-164-187]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
144
+ [default1]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=1|ip-26-0-164-187]: No checkpoint path provided.
145
+ [default7]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=7|ip-26-0-164-187]: Local number of parameters: 139M (264.73MiB)
146
+ [default7]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=7|ip-26-0-164-187]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
147
+ [default7]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=7|ip-26-0-164-187]: No checkpoint path provided.
148
+ [default3]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=3|ip-26-0-164-187]: Local number of parameters: 139M (264.73MiB)
149
+ [default3]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=3|ip-26-0-164-187]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
150
+ [default3]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=3|ip-26-0-164-187]: No checkpoint path provided.
151
+ [default2]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=2|ip-26-0-164-187]: Local number of parameters: 139M (264.73MiB)
152
+ [default2]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=2|ip-26-0-164-187]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
153
+ [default2]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=2|ip-26-0-164-187]: No checkpoint path provided.
154
+ [default4]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=4|ip-26-0-164-187]: Local number of parameters: 139M (264.73MiB)
155
+ [default4]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=4|ip-26-0-164-187]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
156
+ [default4]:07/04/2024 00:06:18 [INFO|DP=0|PP=0|TP=4|ip-26-0-164-187]: No checkpoint path provided.
157
+ [default0]:07/04/2024 00:06:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
158
+ [default0]:07/04/2024 00:06:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Using `datasets` library
159
+ [default0]:07/04/2024 00:06:20 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
160
+ [default0]:07/04/2024 00:06:20 [WARNING|DP=0|PP=0|TP=0|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
161
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
162
+ [default0]:07/04/2024 00:06:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [Training Plan] There are 1 training stages
163
+ [default0]:07/04/2024 00:06:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [Stage Training Stage] start from step 1
164
+ [default0]:07/04/2024 00:06:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]:
165
+ [default0]:07/04/2024 00:06:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: [Start training] datetime: 2024-07-04 00:06:22.734159 | mbs: 512 | grad_accum: 2 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
166
+ [default0]:07/04/2024 00:06:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
167
+ [default0]:07/04/2024 00:06:22 [INFO|DP=0|PP=0|TP=0|ip-26-0-164-187]: Memory usage: 1350.75MiB. Peak allocated 1350.76MiB. Peak reserved: 1384.00MiB
168
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
169
+ [default2]:07/04/2024 00:06:22 [WARNING|DP=0|PP=0|TP=2|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
170
+ [default4]:07/04/2024 00:06:22 [WARNING|DP=0|PP=0|TP=4|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
171
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
172
+ [default7]:07/04/2024 00:06:23 [WARNING|DP=0|PP=0|TP=7|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
173
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
174
+ [default6]:07/04/2024 00:06:23 [WARNING|DP=0|PP=0|TP=6|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
175
+ [default1]:07/04/2024 00:06:23 [WARNING|DP=0|PP=0|TP=1|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
176
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
177
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
178
+ [default5]:07/04/2024 00:06:23 [WARNING|DP=0|PP=0|TP=5|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
179
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
180
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
181
+ [default3]:07/04/2024 00:06:23 [WARNING|DP=0|PP=0|TP=3|ip-26-0-164-187]: Repo card metadata block was not found. Setting CardData to empty.
182
+ [default4]:[rank4]: Traceback (most recent call last):
183
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
184
+ [default4]:[rank4]: trainer.train(dataloader)
185
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
186
+ [default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
187
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
188
+ [default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter(
189
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
190
+ [default4]:[rank4]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
191
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
192
+ [default4]:[rank4]: output = model(**micro_batch)
193
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
194
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
195
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
196
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
197
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
198
+ [default4]:[rank4]: sharded_logits = self.model(
199
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
200
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
201
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
202
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
203
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
204
+ [default4]:[rank4]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
205
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
206
+ [default4]:[rank4]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
207
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
208
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
209
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
210
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
211
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
212
+ [default4]:[rank4]: output = self.pp_block(**new_kwargs)
213
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
214
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
215
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
216
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
217
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
218
+ [default4]:[rank4]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
219
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
220
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
221
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
222
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
223
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
224
+ [default4]:[rank4]: merged_states = self.gate_up_proj(hidden_states)
225
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
226
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
227
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
228
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
229
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
230
+ [default4]:[rank4]: return column_linear(
231
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
232
+ [default4]:[rank4]: return F.linear(input, weight, bias)
233
+ [default4]:[rank4]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.18 GiB is free. Including non-PyTorch memory, this process has 76.13 GiB memory in use. Of the allocated memory 61.47 GiB is allocated by PyTorch, and 2.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
234
+ [default1]:[rank1]: Traceback (most recent call last):
235
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
236
+ [default1]:[rank1]: trainer.train(dataloader)
237
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
238
+ [default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
239
+ [default3]:[rank3]: Traceback (most recent call last):
240
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
241
+ [default3]:[rank3]: trainer.train(dataloader)
242
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
243
+ [default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter(
244
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
245
+ [default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
246
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
247
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
248
+ [default1]:[rank1]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
249
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
250
+ [default1]:[rank1]: output = model(**micro_batch)
251
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
252
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
253
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
254
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
255
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
256
+ [default1]:[rank1]: sharded_logits = self.model(
257
+ [default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter(
258
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
259
+ [default3]:[rank3]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
260
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
261
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
262
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
263
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
264
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
265
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
266
+ [default1]:[rank1]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
267
+ [default3]:[rank3]: output = model(**micro_batch)
268
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
269
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
270
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
271
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
272
+ [default1]:[rank1]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
273
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
274
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
275
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
276
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
277
+ [default3]:[rank3]: sharded_logits = self.model(
278
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
279
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
280
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
281
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
282
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
283
+ [default3]:[rank3]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
284
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
285
+ [default3]:[rank3]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
286
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
287
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
288
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
289
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
290
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
291
+ [default3]:[rank3]: output = self.pp_block(**new_kwargs)
292
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
293
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
294
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
295
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
296
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
297
+ [default3]:[rank3]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
298
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
299
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
300
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
301
+ [default0]:[rank0]: Traceback (most recent call last):
302
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
303
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
304
+ [default3]:[rank3]: merged_states = self.gate_up_proj(hidden_states)
305
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
306
+ [default7]:[rank7]: Traceback (most recent call last):
307
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
308
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
309
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
310
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
311
+ [default7]:[rank7]: trainer.train(dataloader)
312
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
313
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
314
+ [default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
315
+ [default1]:[rank1]: output = self.pp_block(**new_kwargs)
316
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
317
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
318
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
319
+ [default0]:[rank0]: trainer.train(dataloader)
320
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
321
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
322
+ [default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter(
323
+ [default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
324
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
325
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
326
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
327
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
328
+ [default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter(
329
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
330
+ [default3]:[rank3]: return column_linear(
331
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
332
+ [default7]:[rank7]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
333
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
334
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
335
+ [default0]:[rank0]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
336
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
337
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
338
+ [default0]:[rank0]: output = model(**micro_batch)
339
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
340
+ [default7]:[rank7]: output = model(**micro_batch)
341
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
342
+ [default3]:[rank3]: return F.linear(input, weight, bias)
343
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
344
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
345
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
346
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
347
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
348
+ [default1]:[rank1]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
349
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
350
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
351
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
352
+ [default7]:[rank7]: sharded_logits = self.model(
353
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
354
+ [default3]:[rank3]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.18 GiB is free. Including non-PyTorch memory, this process has 76.13 GiB memory in use. Of the allocated memory 61.47 GiB is allocated by PyTorch, and 2.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
355
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
356
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
357
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
358
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
359
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
360
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
361
+ [default0]:[rank0]: sharded_logits = self.model(
362
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
363
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
364
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
365
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
366
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
367
+ [default1]:[rank1]: merged_states = self.gate_up_proj(hidden_states)
368
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
369
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
370
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
371
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
372
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
373
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
374
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
375
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
376
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
377
+ [default0]:[rank0]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
378
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
379
+ [default7]:[rank7]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
380
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
381
+ [default0]:[rank0]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
382
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
383
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
384
+ [default1]:[rank1]: return column_linear(
385
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
386
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
387
+ [default7]:[rank7]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
388
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
389
+ [default1]:[rank1]: return F.linear(input, weight, bias)
390
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
391
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
392
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
393
+ [default1]:[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.18 GiB is free. Including non-PyTorch memory, this process has 76.13 GiB memory in use. Of the allocated memory 61.47 GiB is allocated by PyTorch, and 2.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
394
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
395
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
396
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
397
+ [default0]:[rank0]: output = self.pp_block(**new_kwargs)
398
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
399
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
400
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
401
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
402
+ [default7]:[rank7]: output = self.pp_block(**new_kwargs)
403
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
404
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
405
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
406
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
407
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
408
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
409
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
410
+ [default7]:[rank7]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
411
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
412
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
413
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
414
+ [default0]:[rank0]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
415
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
416
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
417
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
418
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
419
+ [default7]:[rank7]: merged_states = self.gate_up_proj(hidden_states)
420
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
421
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
422
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
423
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
424
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
425
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
426
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
427
+ [default7]:[rank7]: return column_linear(
428
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
429
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
430
+ [default0]:[rank0]: merged_states = self.gate_up_proj(hidden_states)
431
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
432
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
433
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
434
+ [default7]:[rank7]: return F.linear(input, weight, bias)
435
+ [default7]:[rank7]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.89 GiB is free. Including non-PyTorch memory, this process has 75.43 GiB memory in use. Of the allocated memory 61.47 GiB is allocated by PyTorch, and 2.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
436
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
437
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
438
+ [default0]:[rank0]: return column_linear(
439
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
440
+ [default0]:[rank0]: return F.linear(input, weight, bias)
441
+ [default0]:[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU
442
+ [default6]:[rank6]: Traceback (most recent call last):
443
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
444
+ [default5]:[rank5]: Traceback (most recent call last):
445
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
446
+ [default6]:[rank6]: trainer.train(dataloader)
447
+ [default2]:[rank2]: Traceback (most recent call last):
448
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
449
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
450
+ [default5]:[rank5]: trainer.train(dataloader)
451
+ [default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
452
+ [default2]:[rank2]: trainer.train(dataloader)
453
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
454
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
455
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
456
+ [default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
457
+ [default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
458
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
459
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
460
+ [default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter(
461
+ [default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter(
462
+ [default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter(
463
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
464
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
465
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
466
+ [default5]:[rank5]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
467
+ [default6]:[rank6]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
468
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
469
+ [default2]:[rank2]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
470
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
471
+ [default5]:[rank5]: output = model(**micro_batch)
472
+ [default6]:[rank6]: output = model(**micro_batch)
473
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
474
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
475
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
476
+ [default2]:[rank2]: output = model(**micro_batch)
477
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
478
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
479
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
480
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
481
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
482
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
483
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
484
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
485
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
486
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
487
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
488
+ [default6]:[rank6]: sharded_logits = self.model(
489
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
490
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
491
+ [default5]:[rank5]: sharded_logits = self.model(
492
+ [default2]:[rank2]: sharded_logits = self.model(
493
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
494
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
495
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
496
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
497
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
498
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
499
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
500
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
501
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
502
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
503
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
504
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
505
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
506
+ [default5]:[rank5]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
507
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
508
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
509
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
510
+ [default2]:[rank2]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
511
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
512
+ [default5]:[rank5]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
513
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
514
+ [default6]:[rank6]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
515
+ [default2]:[rank2]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
516
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
517
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
518
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
519
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
520
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
521
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
522
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
523
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
524
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
525
+ [default5]:[rank5]: output = self.pp_block(**new_kwargs)
526
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
527
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
528
+ [default6]:[rank6]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
529
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
530
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
531
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
532
+ [default2]:[rank2]: output = self.pp_block(**new_kwargs)
533
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
534
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
535
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
536
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
537
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
538
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
539
+ [default6]:[rank6]: output = self.pp_block(**new_kwargs)
540
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
541
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
542
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
543
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
544
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
545
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
546
+ [default2]:[rank2]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
547
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
548
+ [default5]:[rank5]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
549
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
550
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
551
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
552
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
553
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
554
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
555
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
556
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
557
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
558
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
559
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
560
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
561
+ [default6]:[rank6]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
562
+ [default2]:[rank2]: merged_states = self.gate_up_proj(hidden_states)
563
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
564
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
565
+ [default5]:[rank5]: merged_states = self.gate_up_proj(hidden_states)
566
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
567
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
568
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
569
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
570
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
571
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
572
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
573
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
574
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
575
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
576
+ [default5]:[rank5]: return column_linear(
577
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
578
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
579
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
580
+ [default5]:[rank5]: return F.linear(input, weight, bias)
581
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
582
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
583
+ [default5]:[rank5]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.18 GiB is free. Including non-PyTorch memory, this process has 76.13 GiB memory in use. Of the allocated memory 61.47 GiB is allocated by PyTorch, and 2.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
584
+ [default6]:[rank6]: merged_states = self.gate_up_proj(hidden_states)
585
+ [default2]:[rank2]: return column_linear(
586
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
587
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
588
+ [default2]:[rank2]: return F.linear(input, weight, bias)
589
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
590
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
591
+ [default2]:[rank2]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.18 GiB is free. Including non-PyTorch memory, this process has 76.13 GiB memory in use. Of the allocated memory 61.47 GiB is allocated by PyTorch, and 2.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
592
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
593
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
594
+ [default6]:[rank6]: return column_linear(
595
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
596
+ [default6]:[rank6]: return F.linear(input, weight, bias)
597
+ [default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU  has a total capacity of 79.33 GiB of which 3.18 GiB is free. Including non-PyTorch memory, this process has 76.13 GiB memory in use. Of the allocated memory 61.47 GiB is allocated by PyTorch, and 2.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
598
+ W0704 00:06:34.768000 140468343650112 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 38624 closing signal SIGTERM
599
+ E0704 00:06:35.390000 140468343650112 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 38618) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
600
+ Traceback (most recent call last):
601
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
602
+ sys.exit(main())
603
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
604
+ return f(*args, **kwargs)
605
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
606
+ run(args)
607
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
608
+ elastic_launch(
609
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
610
+ return launch_agent(self._config, self._entrypoint, list(args))
611
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
612
+ raise ChildFailedError(
613
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
614
+ ============================================================
615
+ /fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
616
+ ------------------------------------------------------------
617
+ Failures:
618
+ [1]:
619
+ time : 2024-07-04_00:06:34
620
+ host : ip-26-0-164-187.ec2.internal
621
+ rank : 1 (local_rank: 1)
622
+ exitcode : 1 (pid: 38619)
623
+ error_file: <N/A>
624
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
625
+ [2]:
626
+ time : 2024-07-04_00:06:34
627
+ host : ip-26-0-164-187.ec2.internal
628
+ rank : 2 (local_rank: 2)
629
+ exitcode : 1 (pid: 38620)
630
+ error_file: <N/A>
631
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
632
+ [3]:
633
+ time : 2024-07-04_00:06:34
634
+ host : ip-26-0-164-187.ec2.internal
635
+ rank : 3 (local_rank: 3)
636
+ exitcode : 1 (pid: 38621)
637
+ error_file: <N/A>
638
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
639
+ [4]:
640
+ time : 2024-07-04_00:06:34
641
+ host : ip-26-0-164-187.ec2.internal
642
+ rank : 4 (local_rank: 4)
643
+ exitcode : 1 (pid: 38622)
644
+ error_file: <N/A>
645
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
646
+ [5]:
647
+ time : 2024-07-04_00:06:34
648
+ host : ip-26-0-164-187.ec2.internal
649
+ rank : 5 (local_rank: 5)
650
+ exitcode : 1 (pid: 38623)
651
+ error_file: <N/A>
652
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
653
+ [6]:
654
+ time : 2024-07-04_00:06:34
655
+ host : ip-26-0-164-187.ec2.internal
656
+ rank : 7 (local_rank: 7)
657
+ exitcode : 1 (pid: 38625)
658
+ error_file: <N/A>
659
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
660
+ ------------------------------------------------------------
661
+ Root Cause (first observed failure):
662
+ [0]:
663
+ time : 2024-07-04_00:06:34
664
+ host : ip-26-0-164-187.ec2.internal
665
+ rank : 0 (local_rank: 0)
666
+ exitcode : 1 (pid: 38618)
667
+ error_file: <N/A>
668
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
669
+ ============================================================
670
+ srun: error: ip-26-0-164-187: task 0: Exited with exit code 1
671
+ Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-512/status.txt ADDED
@@ -0,0 +1 @@
1
+ oom