3outeille HF staff committed on
Commit
dcfd885
·
verified ·
1 Parent(s): dcd05a3

Upload llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-128

Browse files
llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-128/bench.slurm ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ #SBATCH --job-name=bench_cluster
4
+ #SBATCH --time=02:00:00
5
+ #SBATCH --partition=hopper-prod
6
+ #SBATCH --nodes=1
7
+ #SBATCH --gres=gpu:8
8
+ #SBATCH --qos=normal
9
+ #SBATCH --ntasks-per-node=1
10
+ #SBATCH --cpus-per-task=96
11
+ #SBATCH --exclusive
12
+ #SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-128/log.out
13
+ #SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-128/log.out
14
+
15
# Poll `squeue` until this job reaches RUNNING, then record "running" in the
# status file and return. If squeue returns nothing (job finished or unknown),
# return without writing.
# Note: for unknown reasons squeue doesn't surface PENDING here; only the
# RUNNING transition is captured.
# Arguments:
#   $1 - SLURM job id to watch
#   $2 - path of the status file to write
update_status() {
    local job_id=$1
    local status_file=$2
    local job_status
    while true; do
        # %T prints the job state (PENDING/RUNNING/...) without decoration.
        job_status=$(squeue --job "$job_id" --noheader --format=%T)
        echo "Job status: $job_status"
        if [ -z "$job_status" ]; then
            # Job has finished or is not found
            break
        elif [ "$job_status" = "RUNNING" ]; then
            printf "running" > "$status_file"
            break
        fi
        sleep 10
    done
}
33
+
34
# Misc initializations.
echo "========================"
echo "START TIME: $(date)"
source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster
echo "python3 version = $(python3 --version)"
echo "========================"

# Slurm stuff
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=$((1024 + RANDOM % 64511))

export TMPDIR=/scratch
export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache"
export CUBLAS_WORKSPACE_CONFIG=":4096:8"
export CUDA_DEVICE_MAX_CONNECTIONS="1"

huggingface-cli login --token "$HUGGINGFACE_TOKEN"

NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
# Hoist the results directory (previously repeated on every status/log/report
# line) into one variable so the paths cannot drift out of sync.
RESULTS_DIR="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-128"
CMD="$NANOTRON_REPO/run_train.py --config-file $RESULTS_DIR/config.yaml"

# Build the launcher as an array: each option is its own word, so no fragile
# backslash continuations inside a quoted string and no accidental re-splitting.
LAUNCHER=(
    torchrun
    --nproc_per_node 8
    --nnodes 1
    --rdzv_endpoint "${MASTER_ADDR}:${MASTER_PORT}"
    --rdzv_backend c10d
    --max_restarts 0
    --tee 3
    --node_rank "${SLURM_PROCID}"
)

# Checkout the bench_cluster branch (abort if the repo path is wrong — running
# the rest of the script from the wrong directory would be misleading).
cd "$NANOTRON_REPO" || exit 1
git checkout bench_cluster
cd ..
# Get the current job ID
job_id=${SLURM_JOB_ID}

# Update status to "pending" or "running" in the background
update_status "$job_id" "$RESULTS_DIR/status.txt" &

# Run the main command. $CMD is deliberately left unquoted so its embedded
# arguments (the --config-file option) split into separate words.
srun -u "${LAUNCHER[@]}" $CMD
exit_status=$?

# Update status based on the exit status of `srun`
if [ "$exit_status" -eq 0 ]; then
    printf "completed" > "$RESULTS_DIR/status.txt"
else
    if grep -q "OutOfMemoryError" "$RESULTS_DIR/log.out"; then
        printf "oom" > "$RESULTS_DIR/status.txt"
    elif grep -q " CUDA error: an illegal memory access" "$RESULTS_DIR/log.out"; then
        # Illegal memory access is classified as an OOM-class failure on purpose.
        printf "oom" > "$RESULTS_DIR/status.txt"
    elif grep -q "Timeout at NCCL" "$RESULTS_DIR/log.out"; then
        printf "timeout" > "$RESULTS_DIR/status.txt"
    else
        printf "fail" > "$RESULTS_DIR/status.txt"
    fi
fi

# Run the report script if the job completed successfully
if [ "$exit_status" -eq 0 ]; then
    python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir "$RESULTS_DIR" --is_logs
    python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir "$RESULTS_DIR" --is_profiler
fi


# Push to hub the folder using huggingface_cli
huggingface-cli upload nanotron/bench_cluster "$RESULTS_DIR" llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-128 --commit-message "Upload llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-128"
# Capture $? immediately so nothing between the upload and the check can
# clobber it.
upload_status=$?

# Verify the upload
if [ "$upload_status" -eq 0 ]; then
    echo "Uploading to Huggingface Hub successful"
else
    echo "Failed to upload to Huggingface Hub"
fi
llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-128/config.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ general:
2
+ project: bench_cluster
3
+ seed: 42
4
+ model:
5
+ ddp_bucket_cap_mb: 25
6
+ dtype: bfloat16
7
+ init_method:
8
+ std: 0.025
9
+ make_vocab_size_divisible_by: 1
10
+ model_config:
11
+ bos_token_id: 1
12
+ eos_token_id: 2
13
+ hidden_act: silu
14
+ hidden_size: 2048
15
+ initializer_range: 0.02
16
+ intermediate_size: 4096
17
+ is_llama_config: true
18
+ max_position_embeddings: 4096
19
+ num_attention_heads: 32
20
+ num_hidden_layers: 24
21
+ num_key_value_heads: 32
22
+ pad_token_id: null
23
+ pretraining_tp: 1
24
+ rms_norm_eps: 1.0e-05
25
+ rope_scaling: null
26
+ rope_theta: 10000.0
27
+ tie_word_embeddings: true
28
+ use_cache: true
29
+ vocab_size: 50257
30
+ optimizer:
31
+ accumulate_grad_in_fp32: true
32
+ clip_grad: 1.0
33
+ learning_rate_scheduler:
34
+ learning_rate: 0.0001
35
+ lr_decay_style: linear
36
+ lr_warmup_style: linear
37
+ lr_warmup_steps: 1
38
+ min_decay_lr: 1.0e-05
39
+ optimizer_factory:
40
+ adam_beta1: 0.9
41
+ adam_beta2: 0.95
42
+ adam_eps: 1.0e-08
43
+ name: adamW
44
+ torch_adam_is_fused: true
45
+ weight_decay: 0.01
46
+ zero_stage: 1
47
+ parallelism:
48
+ dp: 4
49
+ expert_parallel_size: 1
50
+ pp: 1
51
+ pp_engine: 1f1b
52
+ tp: 2
53
+ tp_linear_async_communication: false
54
+ tp_mode: REDUCE_SCATTER
55
+ profiler:
56
+ profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-128
57
+ tokenizer:
58
+ tokenizer_max_length: null
59
+ tokenizer_name_or_path: openai-community/gpt2
60
+ tokenizer_revision: null
61
+ data_stages:
62
+ - name: Training Stage
63
+ start_training_step: 1
64
+ data:
65
+ dataset:
66
+ dataset_overwrite_cache: false
67
+ dataset_processing_num_proc_per_process: 64
68
+ hf_dataset_config_name: null
69
+ hf_dataset_or_datasets: roneneldan/TinyStories
70
+ hf_dataset_splits: train
71
+ text_column_name: text
72
+ num_loading_workers: 0
73
+ seed: 42
74
+ lighteval: null
75
+ tokens:
76
+ train_steps: 20
77
+ val_check_interval: -1
78
+ batch_accumulation_per_replica: 2
79
+ limit_test_batches: 0
80
+ limit_val_batches: 0
81
+ micro_batch_size: 128
82
+ sequence_length: 4096
83
+ logging:
84
+ iteration_step_info_interval: 1
85
+ log_level: info
86
+ log_level_replica: info
87
+ checkpoints:
88
+ checkpoint_interval: 100000
89
+ checkpoints_path: /dev/null
90
+ resume_checkpoint_path: null
llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-128/log.out ADDED
@@ -0,0 +1,692 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ========================
2
+ START TIME: Wed Jul 3 22:50:00 UTC 2024
3
+ python3 version = Python 3.10.14
4
+ ========================
5
+ The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
6
+ Token is valid (permission: write).
7
+ Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token
8
+ Login successful
9
+ Already on 'bench_cluster'
10
+ M examples/config_tiny_llama.py
11
+ M examples/config_tiny_llama.yaml
12
+ M examples/train_tiny_llama.sh
13
+ M src/nanotron/models/llama.py
14
+ M src/nanotron/trainer.py
15
+ Your branch is up to date with 'origin/bench_cluster'.
16
+ Job status: RUNNING
17
+ W0703 22:50:07.857000 140240835700544 torch/distributed/run.py:757]
18
+ W0703 22:50:07.857000 140240835700544 torch/distributed/run.py:757] *****************************************
19
+ W0703 22:50:07.857000 140240835700544 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
20
+ W0703 22:50:07.857000 140240835700544 torch/distributed/run.py:757] *****************************************
21
+ [default0]:07/03/2024 22:50:29 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Vocab Size Padding] Padded vocab (size: 50257) with 1 dummy tokens (new size: 50258)
22
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config:
23
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Config(general=GeneralArgs(project='bench_cluster',
24
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: run='%date_%jobid',
25
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42,
26
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: step=None,
27
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: consumed_train_samples=None,
28
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: benchmark_csv_path=None,
29
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ignore_sanity_checks=True),
30
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: parallelism=ParallelismArgs(dp=4,
31
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp=1,
32
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp=2,
33
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pp_engine=<nanotron.parallel.pipeline_parallel.engine.OneForwardOneBackwardPipelineEngine object at 0x7f4804f50820>,
34
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_mode=<TensorParallelLinearMode.REDUCE_SCATTER: 2>,
35
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tp_linear_async_communication=False,
36
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: expert_parallel_size=1),
37
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1,
38
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2,
39
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu',
40
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048,
41
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02,
42
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096,
43
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True,
44
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096,
45
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32,
46
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24,
47
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32,
48
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None,
49
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1,
50
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05,
51
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None,
52
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0,
53
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True,
54
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True,
55
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50258),
56
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: init_method=RandomInit(std=0.025),
57
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dtype=torch.bfloat16,
58
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: make_vocab_size_divisible_by=1,
59
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: ddp_bucket_cap_mb=25),
60
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2',
61
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_revision=None,
62
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokenizer_max_length=None),
63
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'),
64
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoint_interval=100000,
65
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: save_initial_state=False,
66
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: resume_checkpoint_path=None,
67
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: checkpoints_path_is_shared_file_system=False),
68
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: logging=LoggingArgs(log_level='info',
69
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: log_level_replica='info',
70
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: iteration_step_info_interval=1),
71
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tokens=TokensArgs(sequence_length=4096,
72
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: train_steps=20,
73
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: micro_batch_size=128,
74
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: batch_accumulation_per_replica=2,
75
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: val_check_interval=-1,
76
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_val_batches=0,
77
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: limit_test_batches=0),
78
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08,
79
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta1=0.9,
80
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: adam_beta2=0.95,
81
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: torch_adam_is_fused=True,
82
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: name='adamW'),
83
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: zero_stage=1,
84
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: weight_decay=0.01,
85
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: clip_grad=1.0,
86
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: accumulate_grad_in_fp32=True,
87
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001,
88
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_steps=1,
89
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_warmup_style='linear',
90
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_style='linear',
91
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_steps=19,
92
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lr_decay_starting_step=None,
93
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: min_decay_lr=1e-05)),
94
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data_stages=[DatasetStageArgs(name='Training Stage',
95
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: start_training_step=1,
96
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories',
97
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_splits='train',
98
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hf_dataset_config_name=None,
99
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_processing_num_proc_per_process=64,
100
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: dataset_overwrite_cache=False,
101
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: text_column_name='text'),
102
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: seed=42,
103
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_loading_workers=0))],
104
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-128')),
105
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: lighteval=None)
106
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Model Config:
107
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: LlamaConfig(bos_token_id=1,
108
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: eos_token_id=2,
109
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_act='silu',
110
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: hidden_size=2048,
111
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: initializer_range=0.02,
112
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: intermediate_size=4096,
113
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: is_llama_config=True,
114
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: max_position_embeddings=4096,
115
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_attention_heads=32,
116
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_hidden_layers=24,
117
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: num_key_value_heads=32,
118
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pad_token_id=None,
119
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: pretraining_tp=1,
120
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rms_norm_eps=1e-05,
121
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_scaling=None,
122
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: rope_theta=10000.0,
123
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: tie_word_embeddings=True,
124
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: use_cache=True,
125
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: vocab_size=50258)
126
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Building model..
127
+ [default0]:07/03/2024 22:50:29 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Setting PP block ranks...
128
+ [default0]:07/03/2024 22:50:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Total number of parameters: 1.11G (2116.70MiB)
129
+ [default0]:07/03/2024 22:50:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Local number of parameters: 555M (1058.35MiB)
130
+ [default0]:07/03/2024 22:50:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [After model building] Memory usage: 1082.37MiB. Peak allocated: 1182.56MiB Peak reserved: 1200.00MiB
131
+ [default0]:07/03/2024 22:50:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: No checkpoint path provided.
132
+ [default0]:07/03/2024 22:50:39 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Parametrizing model parameters using StandardParametrizator
133
+ [default1]:07/03/2024 22:50:39 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: Local number of parameters: 555M (1058.35MiB)
134
+ [default1]:07/03/2024 22:50:39 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: [After model building] Memory usage: 1082.37MiB. Peak allocated: 1182.56MiB Peak reserved: 1200.00MiB
135
+ [default1]:07/03/2024 22:50:39 [INFO|DP=0|PP=0|TP=1|ip-26-0-160-225]: No checkpoint path provided.
136
+ [default7]:07/03/2024 22:50:40 [INFO|DP=3|PP=0|TP=1|ip-26-0-160-225]: No checkpoint path provided.
137
+ [default5]:07/03/2024 22:50:40 [INFO|DP=2|PP=0|TP=1|ip-26-0-160-225]: No checkpoint path provided.
138
+ [default6]:07/03/2024 22:50:40 [INFO|DP=3|PP=0|TP=0|ip-26-0-160-225]: No checkpoint path provided.
139
+ [default3]:07/03/2024 22:50:40 [INFO|DP=1|PP=0|TP=1|ip-26-0-160-225]: No checkpoint path provided.
140
+ [default2]:07/03/2024 22:50:40 [INFO|DP=1|PP=0|TP=0|ip-26-0-160-225]: No checkpoint path provided.
141
+ [default4]:07/03/2024 22:50:40 [INFO|DP=2|PP=0|TP=0|ip-26-0-160-225]: No checkpoint path provided.
142
+ [default0]:07/03/2024 22:50:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Optimizer Building] Using LearningRateForSP as learning rate
143
+ [default0]:07/03/2024 22:50:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] Size of optimizer params per rank:
144
+ [default0]:07/03/2024 22:50:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 0 has 139M out of 555M (25.00%) params' optimizer states
145
+ [default0]:07/03/2024 22:50:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 1 has 139M out of 555M (25.00%) params' optimizer states
146
+ [default0]:07/03/2024 22:50:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 2 has 139M out of 555M (25.00%) params' optimizer states
147
+ [default0]:07/03/2024 22:50:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [ZeRO sharding] DP Rank 3 has 139M out of 555M (25.00%) params' optimizer states
148
+ [default0]:07/03/2024 22:50:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
149
+ [default0]:07/03/2024 22:50:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Using `datasets` library
150
+ [default0]:07/03/2024 22:50:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
151
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
152
+ [default0]:07/03/2024 22:50:46 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
153
+ [default0]:07/03/2024 22:50:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Training Plan] There are 1 training stages
154
+ [default0]:07/03/2024 22:50:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Stage Training Stage] start from step 1
155
+ [default0]:07/03/2024 22:50:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]:
156
+ [default0]:07/03/2024 22:50:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: [Start training] datetime: 2024-07-03 22:50:48.312886 | mbs: 128 | grad_accum: 2 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
157
+ [default0]:07/03/2024 22:50:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
158
+ [default0]:07/03/2024 22:50:48 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-225]: Memory usage: 3729.08MiB. Peak allocated 3729.08MiB. Peak reserved: 3848.00MiB
159
+ [default2]:07/03/2024 22:50:48 [WARNING|DP=1|PP=0|TP=0|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
160
+ [default1]:07/03/2024 22:50:48 [WARNING|DP=0|PP=0|TP=1|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
161
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
162
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
163
+ [default4]:07/03/2024 22:50:48 [WARNING|DP=2|PP=0|TP=0|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
164
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
165
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
166
+ [default7]:07/03/2024 22:50:48 [WARNING|DP=3|PP=0|TP=1|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
167
+ [default5]:07/03/2024 22:50:48 [WARNING|DP=2|PP=0|TP=1|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
168
+ [default3]:07/03/2024 22:50:48 [WARNING|DP=1|PP=0|TP=1|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
169
+ [default6]:07/03/2024 22:50:48 [WARNING|DP=3|PP=0|TP=0|ip-26-0-160-225]: Repo card metadata block was not found. Setting CardData to empty.
170
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
171
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
172
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
173
+ [default0]:[rank0]: Traceback (most recent call last):
174
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
175
+ [default0]:[rank0]: trainer.train(dataloader)
176
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
177
+ [default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
178
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
179
+ [default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter(
180
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
181
+ [default0]:[rank0]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
182
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
183
+ [default0]:[rank0]: output = model(**micro_batch)
184
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
185
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
186
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
187
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
188
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
189
+ [default0]:[rank0]: sharded_logits = self.model(
190
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
191
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
192
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
193
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
194
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
195
+ [default0]:[rank0]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
196
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
197
+ [default0]:[rank0]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
198
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
199
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
200
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
201
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
202
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
203
+ [default0]:[rank0]: output = self.pp_block(**new_kwargs)
204
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
205
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
206
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
207
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
208
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
209
+ [default0]:[rank0]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
210
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
211
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
212
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
213
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
214
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
215
+ [default0]:[rank0]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
216
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
217
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
218
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
219
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
220
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
221
+ [default0]:[rank0]: return self.act(gate_states) * up_states
222
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
223
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
224
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
225
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
226
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/nn/activations.py", line 149, in forward
227
+ [default0]:[rank0]: return nn.functional.silu(input)
228
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/functional.py", line 2102, in silu
229
+ [default0]:[rank0]: return torch._C._nn.silu(input)
230
+ [default0]:[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU
231
+ [default1]:[rank1]: Traceback (most recent call last):
232
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
233
+ [default1]:[rank1]: trainer.train(dataloader)
234
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
235
+ [default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
236
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
237
+ [default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter(
238
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
239
+ [default1]:[rank1]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
240
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
241
+ [default1]:[rank1]: output = model(**micro_batch)
242
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
243
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
244
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
245
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
246
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
247
+ [default1]:[rank1]: sharded_logits = self.model(
248
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
249
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
250
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
251
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
252
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
253
+ [default1]:[rank1]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
254
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
255
+ [default1]:[rank1]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
256
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
257
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
258
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
259
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
260
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
261
+ [default1]:[rank1]: output = self.pp_block(**new_kwargs)
262
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
263
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
264
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
265
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
266
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
267
+ [default1]:[rank1]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
268
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
269
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
270
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
271
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
272
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
273
+ [default1]:[rank1]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
274
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
275
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
276
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
277
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
278
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
279
+ [default1]:[rank1]: return self.act(gate_states) * up_states
280
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
281
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
282
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
283
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
284
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/nn/activations.py", line 149, in forward
285
+ [default1]:[rank1]: return nn.functional.silu(input)
286
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/functional.py", line 2102, in silu
287
+ [default1]:[rank1]: return torch._C._nn.silu(input)
288
+ [default1]:[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 191.94 MiB is free. Including non-PyTorch memory, this process has 79.13 GiB memory in use. Of the allocated memory 64.79 GiB is allocated by PyTorch, and 2.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
289
+ [default4]:[rank4]: Traceback (most recent call last):
290
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
291
+ [default4]:[rank4]: trainer.train(dataloader)
292
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
293
+ [default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
294
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
295
+ [default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter(
296
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
297
+ [default4]:[rank4]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
298
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
299
+ [default4]:[rank4]: output = model(**micro_batch)
300
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
301
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
302
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
303
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
304
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
305
+ [default4]:[rank4]: sharded_logits = self.model(
306
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
307
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
308
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
309
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
310
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
311
+ [default4]:[rank4]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
312
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
313
+ [default4]:[rank4]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
314
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
315
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
316
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
317
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
318
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
319
+ [default4]:[rank4]: output = self.pp_block(**new_kwargs)
320
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
321
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
322
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
323
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
324
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
325
+ [default4]:[rank4]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
326
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
327
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
328
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
329
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
330
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
331
+ [default4]:[rank4]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
332
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
333
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
334
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
335
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
336
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
337
+ [default4]:[rank4]: return self.act(gate_states) * up_states
338
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
339
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
340
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
341
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
342
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/nn/activations.py", line 149, in forward
343
+ [default4]:[rank4]: return nn.functional.silu(input)
344
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/functional.py", line 2102, in silu
345
+ [default4]:[rank4]: return torch._C._nn.silu(input)
346
+ [default4]:[rank4]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 143.94 MiB is free. Including non-PyTorch memory, this process has 79.18 GiB memory in use. Of the allocated memory 64.79 GiB is allocated by PyTorch, and 2.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
347
+ [default5]:[rank5]: Traceback (most recent call last):
348
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
349
+ [default5]:[rank5]: trainer.train(dataloader)
350
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
351
+ [default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
352
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
353
+ [default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter(
354
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
355
+ [default5]:[rank5]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
356
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
357
+ [default5]:[rank5]: output = model(**micro_batch)
358
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
359
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
360
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
361
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
362
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
363
+ [default5]:[rank5]: sharded_logits = self.model(
364
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
365
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
366
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
367
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
368
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
369
+ [default5]:[rank5]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
370
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
371
+ [default5]:[rank5]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
372
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
373
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
374
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
375
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
376
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
377
+ [default5]:[rank5]: output = self.pp_block(**new_kwargs)
378
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
379
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
380
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
381
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
382
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
383
+ [default5]:[rank5]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
384
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
385
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
386
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
387
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
388
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
389
+ [default5]:[rank5]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
390
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
391
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
392
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
393
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
394
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
395
+ [default5]:[rank5]: return self.act(gate_states) * up_states
396
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
397
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
398
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
399
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
400
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/nn/activations.py", line 149, in forward
401
+ [default5]:[rank5]: return nn.functional.silu(input)
402
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/functional.py", line 2102, in silu
403
+ [default5]:[rank5]: return torch._C._nn.silu(input)
404
+ [default5]:[rank5]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 143.94 MiB is free. Including non-PyTorch memory, this process has 79.18 GiB memory in use. Of the allocated memory 64.79 GiB is allocated by PyTorch, and 2.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
405
+ [default7]:[rank7]: Traceback (most recent call last):
406
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
407
+ [default7]:[rank7]: trainer.train(dataloader)
408
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
409
+ [default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
410
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
411
+ [default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter(
412
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
413
+ [default7]:[rank7]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
414
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
415
+ [default7]:[rank7]: output = model(**micro_batch)
416
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
417
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
418
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
419
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
420
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
421
+ [default7]:[rank7]: sharded_logits = self.model(
422
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
423
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
424
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
425
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
426
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
427
+ [default7]:[rank7]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
428
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
429
+ [default7]:[rank7]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
430
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
431
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
432
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
433
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
434
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
435
+ [default7]:[rank7]: output = self.pp_block(**new_kwargs)
436
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
437
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
438
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
439
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
440
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
441
+ [default7]:[rank7]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
442
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
443
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
444
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
445
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
446
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
447
+ [default7]:[rank7]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
448
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
449
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
450
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
451
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
452
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
453
+ [default7]:[rank7]: return self.act(gate_states) * up_states
454
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
455
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
456
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
457
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
458
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/nn/activations.py", line 149, in forward
459
+ [default7]:[rank7]: return nn.functional.silu(input)
460
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/functional.py", line 2102, in silu
461
+ [default7]:[rank7]: return torch._C._nn.silu(input)
462
+ [default7]:[rank7]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 863.94 MiB is free. Including non-PyTorch memory, this process has 78.47 GiB memory in use. Of the allocated memory 64.79 GiB is allocated by PyTorch, and 2.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
463
+ [default2]:[rank2]: Traceback (most recent call last):
464
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
465
+ [default2]:[rank2]: trainer.train(dataloader)
466
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
467
+ [default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
468
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
469
+ [default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter(
470
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
471
+ [default2]:[rank2]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
472
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
473
+ [default2]:[rank2]: output = model(**micro_batch)
474
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
475
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
476
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
477
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
478
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
479
+ [default2]:[rank2]: sharded_logits = self.model(
480
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
481
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
482
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
483
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
484
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
485
+ [default2]:[rank2]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
486
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
487
+ [default2]:[rank2]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
488
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
489
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
490
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
491
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
492
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
493
+ [default2]:[rank2]: output = self.pp_block(**new_kwargs)
494
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
495
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
496
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
497
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
498
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
499
+ [default2]:[rank2]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
500
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
501
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
502
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
503
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
504
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
505
+ [default2]:[rank2]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
506
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
507
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
508
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
509
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
510
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
511
+ [default2]:[rank2]: return self.act(gate_states) * up_states
512
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
513
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
514
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
515
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
516
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/nn/activations.py", line 149, in forward
517
+ [default2]:[rank2]: return nn.functional.silu(input)
518
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/functional.py", line 2102, in silu
519
+ [default2]:[rank2]: return torch._C._nn.silu(input)
520
+ [default2]:[rank2]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 143.94 MiB is free. Including non-PyTorch memory, this process has 79.18 GiB memory in use. Of the allocated memory 64.79 GiB is allocated by PyTorch, and 2.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
521
+ [default3]:[rank3]: Traceback (most recent call last):
522
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
523
+ [default3]:[rank3]: trainer.train(dataloader)
524
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
525
+ [default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
526
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
527
+ [default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter(
528
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
529
+ [default3]:[rank3]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
530
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
531
+ [default3]:[rank3]: output = model(**micro_batch)
532
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
533
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
534
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
535
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
536
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
537
+ [default3]:[rank3]: sharded_logits = self.model(
538
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
539
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
540
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
541
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
542
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
543
+ [default3]:[rank3]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
544
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
545
+ [default3]:[rank3]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
546
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
547
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
548
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
549
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
550
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
551
+ [default3]:[rank3]: output = self.pp_block(**new_kwargs)
552
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
553
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
554
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
555
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
556
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
557
+ [default3]:[rank3]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
558
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
559
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
560
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
561
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
562
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
563
+ [default3]:[rank3]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
564
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
565
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
566
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
567
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
568
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
569
+ [default3]:[rank3]: return self.act(gate_states) * up_states
570
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
571
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
572
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
573
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
574
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/nn/activations.py", line 149, in forward
575
+ [default3]:[rank3]: return nn.functional.silu(input)
576
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/functional.py", line 2102, in silu
577
+ [default3]:[rank3]: return torch._C._nn.silu(input)
578
+ [default3]:[rank3]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 143.94 MiB is free. Including non-PyTorch memory, this process has 79.18 GiB memory in use. Of the allocated memory 64.79 GiB is allocated by PyTorch, and 2.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
579
+ [default6]:[rank6]: Traceback (most recent call last):
580
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
581
+ [default6]:[rank6]: trainer.train(dataloader)
582
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
583
+ [default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
584
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
585
+ [default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter(
586
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
587
+ [default6]:[rank6]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
588
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
589
+ [default6]:[rank6]: output = model(**micro_batch)
590
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
591
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
592
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
593
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
594
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
595
+ [default6]:[rank6]: sharded_logits = self.model(
596
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
597
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
598
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
599
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
600
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
601
+ [default6]:[rank6]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
602
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
603
+ [default6]:[rank6]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
604
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
605
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
606
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
607
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
608
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
609
+ [default6]:[rank6]: output = self.pp_block(**new_kwargs)
610
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
611
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
612
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
613
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
614
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
615
+ [default6]:[rank6]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
616
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
617
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
618
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
619
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
620
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
621
+ [default6]:[rank6]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
622
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
623
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
624
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
625
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
626
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
627
+ [default6]:[rank6]: return self.act(gate_states) * up_states
628
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
629
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
630
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
631
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
632
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/nn/activations.py", line 149, in forward
633
+ [default6]:[rank6]: return nn.functional.silu(input)
634
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/functional.py", line 2102, in silu
635
+ [default6]:[rank6]: return torch._C._nn.silu(input)
636
+ [default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU  has a total capacity of 79.33 GiB of which 383.94 MiB is free. Including non-PyTorch memory, this process has 78.94 GiB memory in use. Of the allocated memory 64.79 GiB is allocated by PyTorch, and 2.93 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
637
+ W0703 22:50:57.973000 140240835700544 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 346170 closing signal SIGTERM
638
+ W0703 22:50:57.973000 140240835700544 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 346171 closing signal SIGTERM
639
+ W0703 22:50:57.974000 140240835700544 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 346174 closing signal SIGTERM
640
+ W0703 22:50:57.974000 140240835700544 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 346175 closing signal SIGTERM
641
+ E0703 22:50:59.391000 140240835700544 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 346168) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
642
+ Traceback (most recent call last):
643
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
644
+ sys.exit(main())
645
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
646
+ return f(*args, **kwargs)
647
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
648
+ run(args)
649
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
650
+ elastic_launch(
651
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
652
+ return launch_agent(self._config, self._entrypoint, list(args))
653
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
654
+ raise ChildFailedError(
655
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
656
+ ============================================================
657
+ /fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
658
+ ------------------------------------------------------------
659
+ Failures:
660
+ [1]:
661
+ time : 2024-07-03_22:50:57
662
+ host : ip-26-0-160-225.ec2.internal
663
+ rank : 1 (local_rank: 1)
664
+ exitcode : 1 (pid: 346169)
665
+ error_file: <N/A>
666
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
667
+ [2]:
668
+ time : 2024-07-03_22:50:57
669
+ host : ip-26-0-160-225.ec2.internal
670
+ rank : 4 (local_rank: 4)
671
+ exitcode : 1 (pid: 346172)
672
+ error_file: <N/A>
673
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
674
+ [3]:
675
+ time : 2024-07-03_22:50:57
676
+ host : ip-26-0-160-225.ec2.internal
677
+ rank : 5 (local_rank: 5)
678
+ exitcode : 1 (pid: 346173)
679
+ error_file: <N/A>
680
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
681
+ ------------------------------------------------------------
682
+ Root Cause (first observed failure):
683
+ [0]:
684
+ time : 2024-07-03_22:50:57
685
+ host : ip-26-0-160-225.ec2.internal
686
+ rank : 0 (local_rank: 0)
687
+ exitcode : 1 (pid: 346168)
688
+ error_file: <N/A>
689
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
690
+ ============================================================
691
+ srun: error: ip-26-0-160-225: task 0: Exited with exit code 1
692
+ Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
llama-1B/8_GPUS/dp-4_tp-2_pp-1_mbz-128/status.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ oom