3outeille (HF staff) committed
Commit: eff8ee1
Parent: f3bc9a0

Upload llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8

llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8/bench.slurm ADDED
@@ -0,0 +1,111 @@
+ #!/bin/bash
+
+ #SBATCH --job-name=bench_cluster
+ #SBATCH --time=00:59:00
+ #SBATCH --partition=hopper-prod
+ #SBATCH --nodes=2
+ #SBATCH --gres=gpu:8
+ #SBATCH --qos=high
+ #SBATCH --ntasks-per-node=1
+ #SBATCH --cpus-per-task=96
+ #SBATCH --exclusive
+ #SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8/log.out
+ #SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8/log.out
+
+ # Function to update status based on squeue output
+ update_status() {
+     job_id=$1
+     status_file=$2
+     # For unknown reasons, it doesn't update the status for pending jobs; it only works for running ones.
+     while true; do
+         job_status=$(squeue --job $job_id --noheader --format=%T)
+         echo "Job status: $job_status"
+         if [ -z "$job_status" ]; then
+             # Job has finished or is not found
+             break
+         elif [ "$job_status" = "RUNNING" ]; then
+             printf "running" > $status_file
+             break
+         fi
+         sleep 10
+     done
+ }
+
+ # Misc initializations.
+ echo "========================"
+ echo "START TIME: $(date)"
+ source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
+ conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster
+ echo python3 version = $(python3 --version)
+ echo "========================"
+
+ # Slurm stuff
+ export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+ export MASTER_PORT=$((1024 + RANDOM % 64511))
+
+ export TMPDIR=/scratch
+ export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache"
+ export CUBLAS_WORKSPACE_CONFIG=":4096:8"
+ export CUDA_DEVICE_MAX_CONNECTIONS="1"
+
+ huggingface-cli login --token $HUGGINGFACE_TOKEN
+
+
+ NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
+ CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8/config.yaml"
+
+ LAUNCHER="torchrun \
+     --nproc_per_node 8 \
+     --nnodes 2 \
+     --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \
+     --rdzv_backend c10d \
+     --max_restarts 0 \
+     --tee 3 \
+     --node_rank ${SLURM_PROCID}"
+
+ # Checkout the bench_cluster branch
+ cd $NANOTRON_REPO
+ git checkout bench_cluster
+ cd ..
+ # Get the current job ID
+ job_id=${SLURM_JOB_ID}
+
+ # Update status to "pending" or "running" in the background
+ update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8/status.txt &
+
+ # Run the main command
+ srun -u $LAUNCHER $CMD
+ exit_status=$?
+
+ # Update status based on the exit status of `srun`
+ if [ $exit_status -eq 0 ]; then
+     printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8/status.txt
+ else
+     if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8/log.out; then
+         printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8/status.txt
+     elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8/log.out; then
+         printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8/status.txt
+     elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8/log.out; then
+         printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8/status.txt
+     else
+         printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8/status.txt
+     fi
+ fi
+
+ # Run the report script if the job completed successfully
+ if [ $exit_status -eq 0 ]; then
+     python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8 --is_logs
+     python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8 --is_profiler
+ fi
+
+
+ # Push to hub the folder using huggingface_cli
+ huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8 llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8 --commit-message "Upload llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8"
+
+ # Verify the upload
+ if [ $? -eq 0 ]; then
+     echo "Uploading to Huggingface Hub successful"
+ else
+     echo "Failed to upload to Huggingface Hub"
+ fi
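
For reference, the error handling above reduces to a keyword lookup over log.out: srun's exit status decides success, and otherwise the first matching grep pattern picks the status keyword. A minimal Python sketch of that classification (the patterns are copied from the grep chain above; classify_status is an illustrative name, not part of the bench_cluster code):

# Hedged sketch: mirrors the grep chain in bench.slurm, not an exact reimplementation.
def classify_status(exit_status: int, log_text: str) -> str:
    if exit_status == 0:
        return "completed"
    if "OutOfMemoryError" in log_text or "CUDA error: an illegal memory access" in log_text:
        return "oom"
    if "Timeout at NCCL" in log_text:
        return "timeout"
    return "fail"

# Example: the log.out below ends in torch.cuda.OutOfMemoryError, so this failed run
# would be recorded as "oom" in status.txt.
print(classify_status(1, "torch.cuda.OutOfMemoryError: CUDA out of memory."))  # -> oom
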
llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8/config.yaml ADDED
@@ -0,0 +1,90 @@
+ general:
+   project: bench_cluster
+   seed: 42
+ model:
+   ddp_bucket_cap_mb: 25
+   dtype: bfloat16
+   init_method:
+     std: 0.025
+   make_vocab_size_divisible_by: 1
+   model_config:
+     bos_token_id: 1
+     eos_token_id: 2
+     hidden_act: silu
+     hidden_size: 2048
+     initializer_range: 0.02
+     intermediate_size: 4096
+     is_llama_config: true
+     max_position_embeddings: 4096
+     num_attention_heads: 32
+     num_hidden_layers: 24
+     num_key_value_heads: 32
+     pad_token_id: null
+     pretraining_tp: 1
+     rms_norm_eps: 1.0e-05
+     rope_scaling: null
+     rope_theta: 10000.0
+     tie_word_embeddings: true
+     use_cache: true
+     vocab_size: 50257
+ optimizer:
+   accumulate_grad_in_fp32: true
+   clip_grad: 1.0
+   learning_rate_scheduler:
+     learning_rate: 0.0001
+     lr_decay_style: linear
+     lr_warmup_style: linear
+     lr_warmup_steps: 1
+     min_decay_lr: 1.0e-05
+   optimizer_factory:
+     adam_beta1: 0.9
+     adam_beta2: 0.95
+     adam_eps: 1.0e-08
+     name: adamW
+     torch_adam_is_fused: true
+   weight_decay: 0.01
+   zero_stage: 1
+ parallelism:
+   dp: 16
+   expert_parallel_size: 1
+   pp: 1
+   pp_engine: 1f1b
+   tp: 1
+   tp_linear_async_communication: false
+   tp_mode: REDUCE_SCATTER
+ profiler:
+   profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8
+ tokenizer:
+   tokenizer_max_length: null
+   tokenizer_name_or_path: openai-community/gpt2
+   tokenizer_revision: null
+ data_stages:
+ - name: Training Stage
+   start_training_step: 1
+   data:
+     dataset:
+       dataset_overwrite_cache: false
+       dataset_processing_num_proc_per_process: 64
+       hf_dataset_config_name: null
+       hf_dataset_or_datasets: roneneldan/TinyStories
+       hf_dataset_splits: train
+       text_column_name: text
+     num_loading_workers: 32
+     seed: 42
+ lighteval: null
+ tokens:
+   train_steps: 20
+   val_check_interval: -1
+   batch_accumulation_per_replica: 8
+   limit_test_batches: 0
+   limit_val_batches: 0
+   micro_batch_size: 8
+   sequence_length: 4096
+ logging:
+   iteration_step_info_interval: 1
+   log_level: info
+   log_level_replica: info
+ checkpoints:
+   checkpoint_interval: 100000
+   checkpoints_path: /dev/null
+   resume_checkpoint_path: null
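
Taken together, the parallelism and tokens blocks determine the effective batch: 16 data-parallel replicas, each accumulating 8 micro-batches of 8 sequences, give 16 * 8 * 8 = 1024 sequences of 4096 tokens per optimizer step (about 4.19M tokens), which matches the global_batch_size: 1024 reported in log.out below. A small sketch of that arithmetic (plain Python; the variable names simply mirror the config keys):

# Effective batch arithmetic implied by config.yaml (values copied from above).
dp = 16                              # data-parallel replicas
micro_batch_size = 8                 # sequences per micro-batch (mbz)
batch_accumulation_per_replica = 8   # gradient-accumulation steps per replica
sequence_length = 4096

global_batch_size = dp * micro_batch_size * batch_accumulation_per_replica
tokens_per_step = global_batch_size * sequence_length

print(global_batch_size)  # 1024, as logged at the start of training in log.out
print(tokens_per_step)    # 4194304 tokens per training step
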
llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8/log.out ADDED
@@ -0,0 +1,776 @@
1
+ ========================
2
+ START TIME: Tue Jul 2 16:30:16 UTC 2024
3
+ python3 version = Python 3.10.14
4
+ ========================
5
+ The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
6
+ Token is valid (permission: write).
7
+ Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token
8
+ Login successful
9
+ Already on 'bench_cluster'
10
+ M examples/config_tiny_llama.py
11
+ M examples/config_tiny_llama.yaml
12
+ M examples/train_tiny_llama.sh
13
+ M src/nanotron/models/llama.py
14
+ M src/nanotron/trainer.py
15
+ Your branch is up to date with 'origin/bench_cluster'.
16
+ Job status: RUNNING
17
+ W0702 16:30:18.958000 139728007939904 torch/distributed/run.py:757]
18
+ W0702 16:30:18.958000 139728007939904 torch/distributed/run.py:757] *****************************************
19
+ W0702 16:30:18.958000 139728007939904 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
20
+ W0702 16:30:18.958000 139728007939904 torch/distributed/run.py:757] *****************************************
21
+ W0702 16:30:18.975000 140288511039296 torch/distributed/run.py:757]
22
+ W0702 16:30:18.975000 140288511039296 torch/distributed/run.py:757] *****************************************
23
+ W0702 16:30:18.975000 140288511039296 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
24
+ W0702 16:30:18.975000 140288511039296 torch/distributed/run.py:757] *****************************************
25
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config:
26
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Config(general=GeneralArgs(project='bench_cluster',
27
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: run='%date_%jobid',
28
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42,
29
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: step=None,
30
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: consumed_train_samples=None,
31
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: benchmark_csv_path=None,
32
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ignore_sanity_checks=True),
33
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: parallelism=ParallelismArgs(dp=16,
34
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp=1,
35
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp=1,
36
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pp_engine=<nanotron.parallel.pipeline_parallel.engine.OneForwardOneBackwardPipelineEngine object at 0x7f2370f98910>,
37
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_mode=<TensorParallelLinearMode.REDUCE_SCATTER: 2>,
38
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tp_linear_async_communication=False,
39
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: expert_parallel_size=1),
40
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1,
41
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2,
42
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu',
43
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048,
44
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02,
45
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096,
46
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True,
47
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096,
48
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32,
49
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24,
50
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32,
51
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None,
52
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1,
53
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05,
54
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None,
55
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0,
56
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True,
57
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True,
58
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50257),
59
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: init_method=RandomInit(std=0.025),
60
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dtype=torch.bfloat16,
61
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: make_vocab_size_divisible_by=1,
62
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: ddp_bucket_cap_mb=25),
63
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2',
64
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_revision=None,
65
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokenizer_max_length=None),
66
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'),
67
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoint_interval=100000,
68
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: save_initial_state=False,
69
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: resume_checkpoint_path=None,
70
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: checkpoints_path_is_shared_file_system=False),
71
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: logging=LoggingArgs(log_level='info',
72
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: log_level_replica='info',
73
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: iteration_step_info_interval=1),
74
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tokens=TokensArgs(sequence_length=4096,
75
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: train_steps=20,
76
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: micro_batch_size=8,
77
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: batch_accumulation_per_replica=8,
78
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: val_check_interval=-1,
79
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_val_batches=0,
80
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: limit_test_batches=0),
81
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08,
82
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta1=0.9,
83
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: adam_beta2=0.95,
84
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: torch_adam_is_fused=True,
85
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: name='adamW'),
86
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: zero_stage=1,
87
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: weight_decay=0.01,
88
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: clip_grad=1.0,
89
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: accumulate_grad_in_fp32=True,
90
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001,
91
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_steps=1,
92
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_warmup_style='linear',
93
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_style='linear',
94
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_steps=19,
95
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lr_decay_starting_step=None,
96
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: min_decay_lr=1e-05)),
97
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data_stages=[DatasetStageArgs(name='Training Stage',
98
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: start_training_step=1,
99
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories',
100
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_splits='train',
101
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hf_dataset_config_name=None,
102
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_processing_num_proc_per_process=64,
103
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: dataset_overwrite_cache=False,
104
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: text_column_name='text'),
105
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: seed=42,
106
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_loading_workers=32))],
107
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8')),
108
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: lighteval=None)
109
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Model Config:
110
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: LlamaConfig(bos_token_id=1,
111
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: eos_token_id=2,
112
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_act='silu',
113
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: hidden_size=2048,
114
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: initializer_range=0.02,
115
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: intermediate_size=4096,
116
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: is_llama_config=True,
117
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: max_position_embeddings=4096,
118
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_attention_heads=32,
119
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_hidden_layers=24,
120
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: num_key_value_heads=32,
121
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pad_token_id=None,
122
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: pretraining_tp=1,
123
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rms_norm_eps=1e-05,
124
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_scaling=None,
125
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: rope_theta=10000.0,
126
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: tie_word_embeddings=True,
127
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: use_cache=True,
128
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: vocab_size=50257)
129
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Building model..
130
+ [default0]:07/02/2024 16:30:36 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Setting PP block ranks...
131
+ [default4]:07/02/2024 16:30:46 [INFO|DP=12|PP=0|TP=0|ip-26-0-162-233]: No checkpoint path provided.
132
+ [default1]:07/02/2024 16:30:46 [INFO|DP=1|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided.
133
+ [default6]:07/02/2024 16:30:46 [INFO|DP=6|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided.
134
+ [default4]:07/02/2024 16:30:46 [INFO|DP=4|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided.
135
+ [default3]:07/02/2024 16:30:46 [INFO|DP=3|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided.
136
+ [default2]:07/02/2024 16:30:46 [INFO|DP=2|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided.
137
+ [default5]:07/02/2024 16:30:46 [INFO|DP=5|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided.
138
+ [default0]:07/02/2024 16:30:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Total number of parameters: 1.11G (2116.51MiB)
139
+ [default0]:07/02/2024 16:30:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Local number of parameters: 1.11G (2116.51MiB)
140
+ [default0]:07/02/2024 16:30:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [After model building] Memory usage: 2140.53MiB. Peak allocated: 2338.88MiB Peak reserved: 2392.00MiB
141
+ [default0]:07/02/2024 16:30:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided.
142
+ [default0]:07/02/2024 16:30:46 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Parametrizing model parameters using StandardParametrizator
143
+ [default7]:07/02/2024 16:30:45 [INFO|DP=7|PP=0|TP=0|ip-26-0-160-192]: No checkpoint path provided.
144
+ [default7]:07/02/2024 16:30:46 [INFO|DP=15|PP=0|TP=0|ip-26-0-162-233]: No checkpoint path provided.
145
+ [default0]:07/02/2024 16:30:46 [INFO|DP=8|PP=0|TP=0|ip-26-0-162-233]: No checkpoint path provided.
146
+ [default1]:07/02/2024 16:30:46 [INFO|DP=9|PP=0|TP=0|ip-26-0-162-233]: No checkpoint path provided.
147
+ [default2]:07/02/2024 16:30:46 [INFO|DP=10|PP=0|TP=0|ip-26-0-162-233]: No checkpoint path provided.
148
+ [default5]:07/02/2024 16:30:46 [INFO|DP=13|PP=0|TP=0|ip-26-0-162-233]: No checkpoint path provided.
149
+ [default3]:07/02/2024 16:30:46 [INFO|DP=11|PP=0|TP=0|ip-26-0-162-233]: No checkpoint path provided.
150
+ [default6]:07/02/2024 16:30:46 [INFO|DP=14|PP=0|TP=0|ip-26-0-162-233]: No checkpoint path provided.
151
+ [default0]:07/02/2024 16:30:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Optimizer Building] Using LearningRateForSP as learning rate
152
+ [default0]:07/02/2024 16:30:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] Size of optimizer params per rank:
153
+ [default0]:07/02/2024 16:30:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 0 has 69.4M out of 1.11G (6.25%) params' optimizer states
154
+ [default0]:07/02/2024 16:30:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 1 has 69.4M out of 1.11G (6.25%) params' optimizer states
155
+ [default0]:07/02/2024 16:30:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 2 has 69.4M out of 1.11G (6.25%) params' optimizer states
156
+ [default0]:07/02/2024 16:30:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 3 has 69.4M out of 1.11G (6.25%) params' optimizer states
157
+ [default0]:07/02/2024 16:30:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 4 has 69.4M out of 1.11G (6.25%) params' optimizer states
158
+ [default0]:07/02/2024 16:30:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 5 has 69.4M out of 1.11G (6.25%) params' optimizer states
159
+ [default0]:07/02/2024 16:30:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 6 has 69.4M out of 1.11G (6.25%) params' optimizer states
160
+ [default0]:07/02/2024 16:30:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 7 has 69.4M out of 1.11G (6.25%) params' optimizer states
161
+ [default0]:07/02/2024 16:30:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 8 has 69.4M out of 1.11G (6.25%) params' optimizer states
162
+ [default0]:07/02/2024 16:30:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 9 has 69.4M out of 1.11G (6.25%) params' optimizer states
163
+ [default0]:07/02/2024 16:30:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 10 has 69.4M out of 1.11G (6.25%) params' optimizer states
164
+ [default0]:07/02/2024 16:30:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 11 has 69.4M out of 1.11G (6.25%) params' optimizer states
165
+ [default0]:07/02/2024 16:30:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 12 has 69.4M out of 1.11G (6.25%) params' optimizer states
166
+ [default0]:07/02/2024 16:30:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 13 has 69.4M out of 1.11G (6.25%) params' optimizer states
167
+ [default0]:07/02/2024 16:30:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 14 has 69.4M out of 1.11G (6.25%) params' optimizer states
168
+ [default0]:07/02/2024 16:30:54 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [ZeRO sharding] DP Rank 15 has 69.4M out of 1.11G (6.25%) params' optimizer states
169
+ [default0]:07/02/2024 16:30:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
170
+ [default0]:07/02/2024 16:30:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Using `datasets` library
171
+ [default0]:07/02/2024 16:30:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
172
+ [default0]:07/02/2024 16:30:56 [WARNING|DP=0|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty.
173
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
174
+ [default0]:07/02/2024 16:30:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Training Plan] There are 1 training stages
175
+ [default0]:07/02/2024 16:30:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Stage Training Stage] start from step 1
176
+ [default0]:07/02/2024 16:30:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]:
177
+ [default0]:07/02/2024 16:30:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: [Start training] datetime: 2024-07-02 16:30:56.969534 | mbs: 8 | grad_accum: 8 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
178
+ [default0]:07/02/2024 16:30:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
179
+ [default0]:07/02/2024 16:30:56 [INFO|DP=0|PP=0|TP=0|ip-26-0-160-192]: Memory usage: 6639.09MiB. Peak allocated 6639.09MiB. Peak reserved: 6892.00MiB
180
+ [default1]:07/02/2024 16:30:57 [WARNING|DP=9|PP=0|TP=0|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty.
181
+ [default7]:07/02/2024 16:30:57 [WARNING|DP=15|PP=0|TP=0|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty.
182
+ [default0]:07/02/2024 16:30:57 [WARNING|DP=8|PP=0|TP=0|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty.
183
+ [default3]:07/02/2024 16:30:57 [WARNING|DP=11|PP=0|TP=0|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty.
184
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
185
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
186
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
187
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
188
+ [default1]:07/02/2024 16:30:57 [WARNING|DP=1|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty.
189
+ [default5]:07/02/2024 16:30:57 [WARNING|DP=5|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty.
190
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
191
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
192
+ [default2]:07/02/2024 16:30:57 [WARNING|DP=10|PP=0|TP=0|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty.
193
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
194
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
195
+ [default6]:07/02/2024 16:30:57 [WARNING|DP=14|PP=0|TP=0|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty.
196
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
197
+ [default6]:07/02/2024 16:30:57 [WARNING|DP=6|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty.
198
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
199
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
200
+ [default4]:07/02/2024 16:30:57 [WARNING|DP=4|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty.
201
+ [default2]:07/02/2024 16:30:57 [WARNING|DP=2|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty.
202
+ [default3]:07/02/2024 16:30:57 [WARNING|DP=3|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty.
203
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
204
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
205
+ [default5]:07/02/2024 16:30:57 [WARNING|DP=13|PP=0|TP=0|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty.
206
+ [default4]:07/02/2024 16:30:57 [WARNING|DP=12|PP=0|TP=0|ip-26-0-162-233]: Repo card metadata block was not found. Setting CardData to empty.
207
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
208
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
209
+ [default7]:07/02/2024 16:30:57 [WARNING|DP=7|PP=0|TP=0|ip-26-0-160-192]: Repo card metadata block was not found. Setting CardData to empty.
210
+ [default0]:[rank0]: Traceback (most recent call last):
211
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
212
+ [default0]:[rank0]: trainer.train(dataloader)
213
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
214
+ [default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
215
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
216
+ [default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter(
217
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
218
+ [default0]:[rank0]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
219
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
220
+ [default0]:[rank0]: output = model(**micro_batch)
221
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
222
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
223
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
224
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
225
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
226
+ [default0]:[rank0]: sharded_logits = self.model(
227
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
228
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
229
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
230
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
231
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
232
+ [default0]:[rank0]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
233
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 786, in forward_with_hidden_states
234
+ [default0]:[rank0]: fp32_sharded_logits = self.cast_to_fp32(x=sharded_logits)["output"]
235
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
236
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
237
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
238
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
239
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
240
+ [default0]:[rank0]: output = self.pp_block(**new_kwargs)
241
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 753, in <lambda>
242
+ [default0]:[rank0]: module_builder=lambda: lambda x: x.float(),
243
+ [default0]:[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.14 GiB. GPU
244
+ [default1]:[rank1]: Traceback (most recent call last):
245
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
246
+ [default1]:[rank1]: trainer.train(dataloader)
247
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
248
+ [default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
249
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
250
+ [default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter(
251
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
252
+ [default1]:[rank1]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
253
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
254
+ [default1]:[rank1]: output = model(**micro_batch)
255
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
256
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
257
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
258
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
259
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
260
+ [default1]:[rank1]: sharded_logits = self.model(
261
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
262
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
263
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
264
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
265
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
266
+ [default1]:[rank1]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
267
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 786, in forward_with_hidden_states
268
+ [default1]:[rank1]: fp32_sharded_logits = self.cast_to_fp32(x=sharded_logits)["output"]
269
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
270
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
271
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
272
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
273
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
274
+ [default1]:[rank1]: output = self.pp_block(**new_kwargs)
275
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 753, in <lambda>
276
+ [default1]:[rank1]: module_builder=lambda: lambda x: x.float(),
277
+ [default1]:[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.14 GiB. GPU  has a total capacity of 79.33 GiB of which 5.28 GiB is free. Including non-PyTorch memory, this process has 74.04 GiB memory in use. Of the allocated memory 66.95 GiB is allocated by PyTorch, and 170.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
278
+ [default1]:[rank9]: OSError: [Errno 122] Disk quota exceeded
279
+ [default1]:
280
+ [default1]:[rank9]: During handling of the above exception, another exception occurred:
281
+ [default1]:
282
+ [default1]:[rank9]: Traceback (most recent call last):
283
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
284
+ [default1]:[rank9]: trainer.train(dataloader)
285
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
286
+ [default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
287
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
288
+ [default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter(
289
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
290
+ [default1]:[rank9]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
291
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
292
+ [default1]:[rank9]: output = model(**micro_batch)
293
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
294
+ [default1]:[rank9]: return self._call_impl(*args, **kwargs)
295
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
296
+ [default1]:[rank9]: return forward_call(*args, **kwargs)
297
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
298
+ [default1]:[rank9]: sharded_logits = self.model(
299
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
300
+ [default1]:[rank9]: return self._call_impl(*args, **kwargs)
301
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
302
+ [default1]:[rank9]: return forward_call(*args, **kwargs)
303
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
304
+ [default1]:[rank9]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
305
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
306
+ [default1]:[rank9]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
307
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
308
+ [default1]:[rank9]: return self._call_impl(*args, **kwargs)
309
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
310
+ [default1]:[rank9]: return forward_call(*args, **kwargs)
311
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
312
+ [default1]:[rank9]: output = self.pp_block(**new_kwargs)
313
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
314
+ [default1]:[rank9]: return self._call_impl(*args, **kwargs)
315
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
316
+ [default1]:[rank9]: return forward_call(*args, **kwargs)
317
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 629, in forward
318
+ [default1]:[rank9]: hidden_states = self.input_layernorm(hidden_states)
319
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
320
+ [default1]:[rank9]: return self._call_impl(*args, **kwargs)
321
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
322
+ [default1]:[rank9]: return forward_call(*args, **kwargs)
323
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/nn/layer_norm.py", line 42, in forward
324
+ [default1]:[rank9]: return layer_norm_fn(
325
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 875, in layer_norm_fn
326
+ [default1]:[rank9]: return LayerNormFn.apply(
327
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
328
+ [default1]:[rank9]: return super().apply(*args, **kwargs) # type: ignore[misc]
329
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 748, in forward
330
+ [default1]:[rank9]: y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = _layer_norm_fwd(
331
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 335, in _layer_norm_fwd
332
+ [default1]:[rank9]: _layer_norm_fwd_1pass_kernel[(M,)](
333
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 167, in <lambda>
334
+ [default1]:[rank9]: return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
335
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 143, in run
336
+ [default1]:[rank9]: timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
337
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 143, in <dictcomp>
338
+ [default1]:[rank9]: timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
339
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 122, in _bench
340
+ [default1]:[rank9]: return do_bench(kernel_call, warmup=self.warmup, rep=self.rep, quantiles=(0.5, 0.2, 0.8))
341
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/testing.py", line 102, in do_bench
342
+ [default1]:[rank9]: fn()
343
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 110, in kernel_call
344
+ [default1]:[rank9]: self.fn.run(
345
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
346
+ [default1]:[rank9]: return self.fn.run(*args, **kwargs)
347
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
348
+ [default1]:[rank9]: return self.fn.run(*args, **kwargs)
349
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
350
+ [default1]:[rank9]: return self.fn.run(*args, **kwargs)
351
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 416, in run
352
+ [default1]:[rank9]: self.cache[device][key] = compile(
353
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/compiler/compiler.py", line 194, in compile
354
+ [default1]:[rank9]: metadata_group[f"{src.name}.{ext}"] = fn_cache_manager.put(next_module, f"{src.name}.{ext}")
355
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/cache.py", line 123, in put
356
+ [default1]:[rank9]: with open(temp_path, mode) as f:
357
+ [default1]:[rank9]: OSError: [Errno 122] Disk quota exceeded
358
+ [default5]:[rank13]: OSError: [Errno 122] Disk quota exceeded
359
+ [default5]:
360
+ [default5]:[rank13]: During handling of the above exception, another exception occurred:
361
+ [default5]:
362
+ [default5]:[rank13]: Traceback (most recent call last):
363
+ [default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
364
+ [default5]:[rank13]: trainer.train(dataloader)
365
+ [default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
366
+ [default5]:[rank13]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
367
+ [default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
368
+ [default5]:[rank13]: outputs = self.pipeline_engine.train_batch_iter(
369
+ [default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
370
+ [default5]:[rank13]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
371
+ [default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
372
+ [default5]:[rank13]: output = model(**micro_batch)
373
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
374
+ [default5]:[rank13]: return self._call_impl(*args, **kwargs)
375
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
376
+ [default5]:[rank13]: return forward_call(*args, **kwargs)
377
+ [default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
378
+ [default5]:[rank13]: sharded_logits = self.model(
379
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
380
+ [default5]:[rank13]: return self._call_impl(*args, **kwargs)
381
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
382
+ [default5]:[rank13]: return forward_call(*args, **kwargs)
383
+ [default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
384
+ [default5]:[rank13]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
385
+ [default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
386
+ [default5]:[rank13]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
387
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
388
+ [default5]:[rank13]: return self._call_impl(*args, **kwargs)
389
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
390
+ [default5]:[rank13]: return forward_call(*args, **kwargs)
391
+ [default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
392
+ [default5]:[rank13]: output = self.pp_block(**new_kwargs)
393
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
394
+ [default5]:[rank13]: return self._call_impl(*args, **kwargs)
395
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
396
+ [default5]:[rank13]: return forward_call(*args, **kwargs)
397
+ [default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 629, in forward
398
+ [default5]:[rank13]: hidden_states = self.input_layernorm(hidden_states)
399
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
400
+ [default5]:[rank13]: return self._call_impl(*args, **kwargs)
401
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
402
+ [default5]:[rank13]: return forward_call(*args, **kwargs)
403
+ [default5]:[rank13]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/nn/layer_norm.py", line 42, in forward
404
+ [default5]:[rank13]: return layer_norm_fn(
405
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 875, in layer_norm_fn
406
+ [default5]:[rank13]: return LayerNormFn.apply(
407
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
408
+ [default5]:[rank13]: return super().apply(*args, **kwargs) # type: ignore[misc]
409
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 748, in forward
410
+ [default5]:[rank13]: y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = _layer_norm_fwd(
411
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 335, in _layer_norm_fwd
412
+ [default5]:[rank13]: _layer_norm_fwd_1pass_kernel[(M,)](
413
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 167, in <lambda>
414
+ [default5]:[rank13]: return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
415
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 143, in run
416
+ [default5]:[rank13]: timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
417
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 143, in <dictcomp>
418
+ [default5]:[rank13]: timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
419
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 122, in _bench
420
+ [default5]:[rank13]: return do_bench(kernel_call, warmup=self.warmup, rep=self.rep, quantiles=(0.5, 0.2, 0.8))
421
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/testing.py", line 102, in do_bench
422
+ [default5]:[rank13]: fn()
423
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 110, in kernel_call
424
+ [default5]:[rank13]: self.fn.run(
425
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
426
+ [default5]:[rank13]: return self.fn.run(*args, **kwargs)
427
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
428
+ [default5]:[rank13]: return self.fn.run(*args, **kwargs)
429
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
430
+ [default5]:[rank13]: return self.fn.run(*args, **kwargs)
431
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 416, in run
432
+ [default5]:[rank13]: self.cache[device][key] = compile(
433
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/compiler/compiler.py", line 194, in compile
434
+ [default5]:[rank13]: metadata_group[f"{src.name}.{ext}"] = fn_cache_manager.put(next_module, f"{src.name}.{ext}")
435
+ [default5]:[rank13]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/cache.py", line 123, in put
436
+ [default5]:[rank13]: with open(temp_path, mode) as f:
437
+ [default5]:[rank13]: OSError: [Errno 122] Disk quota exceeded
438
+ [default2]:[rank10]: Traceback (most recent call last):
439
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
440
+ [default2]:[rank10]: trainer.train(dataloader)
441
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
442
+ [default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
443
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
444
+ [default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter(
445
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
446
+ [default2]:[rank10]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
447
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
448
+ [default2]:[rank10]: output = model(**micro_batch)
449
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
450
+ [default2]:[rank10]: return self._call_impl(*args, **kwargs)
451
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
452
+ [default2]:[rank10]: return forward_call(*args, **kwargs)
453
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
454
+ [default2]:[rank10]: sharded_logits = self.model(
455
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
456
+ [default2]:[rank10]: return self._call_impl(*args, **kwargs)
457
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
458
+ [default2]:[rank10]: return forward_call(*args, **kwargs)
459
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
460
+ [default2]:[rank10]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
461
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 786, in forward_with_hidden_states
462
+ [default2]:[rank10]: fp32_sharded_logits = self.cast_to_fp32(x=sharded_logits)["output"]
463
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
464
+ [default2]:[rank10]: return self._call_impl(*args, **kwargs)
465
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
466
+ [default2]:[rank10]: return forward_call(*args, **kwargs)
467
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
468
+ [default2]:[rank10]: output = self.pp_block(**new_kwargs)
469
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 753, in <lambda>
470
+ [default2]:[rank10]: module_builder=lambda: lambda x: x.float(),
471
+ [default2]:[rank10]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.14 GiB. GPU  has a total capacity of 79.33 GiB of which 5.12 GiB is free. Including non-PyTorch memory, this process has 74.20 GiB memory in use. Of the allocated memory 66.95 GiB is allocated by PyTorch, and 170.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
472
+ [default7]:[rank15]: Traceback (most recent call last):
473
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
474
+ [default7]:[rank15]: trainer.train(dataloader)
475
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
476
+ [default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
477
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
478
+ [default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter(
479
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
480
+ [default7]:[rank15]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
481
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
482
+ [default7]:[rank15]: output = model(**micro_batch)
483
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
484
+ [default7]:[rank15]: return self._call_impl(*args, **kwargs)
485
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
486
+ [default7]:[rank15]: return forward_call(*args, **kwargs)
487
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
488
+ [default7]:[rank15]: sharded_logits = self.model(
489
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
490
+ [default7]:[rank15]: return self._call_impl(*args, **kwargs)
491
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
492
+ [default7]:[rank15]: return forward_call(*args, **kwargs)
493
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
494
+ [default7]:[rank15]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
495
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 786, in forward_with_hidden_states
496
+ [default7]:[rank15]: fp32_sharded_logits = self.cast_to_fp32(x=sharded_logits)["output"]
497
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
498
+ [default7]:[rank15]: return self._call_impl(*args, **kwargs)
499
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
500
+ [default7]:[rank15]: return forward_call(*args, **kwargs)
501
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
502
+ [default7]:[rank15]: output = self.pp_block(**new_kwargs)
503
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 753, in <lambda>
504
+ [default7]:[rank15]: module_builder=lambda: lambda x: x.float(),
505
+ [default7]:[rank15]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.14 GiB. GPU  has a total capacity of 79.33 GiB of which 4.96 GiB is free. Including non-PyTorch memory, this process has 74.36 GiB memory in use. Of the allocated memory 66.95 GiB is allocated by PyTorch, and 170.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
506
+ [default3]:[rank11]: Traceback (most recent call last):
507
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
508
+ [default3]:[rank11]: trainer.train(dataloader)
509
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
510
+ [default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
511
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
512
+ [default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter(
513
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
514
+ [default3]:[rank11]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
515
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
516
+ [default3]:[rank11]: output = model(**micro_batch)
517
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
518
+ [default3]:[rank11]: return self._call_impl(*args, **kwargs)
519
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
520
+ [default3]:[rank11]: return forward_call(*args, **kwargs)
521
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
522
+ [default3]:[rank11]: sharded_logits = self.model(
523
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
524
+ [default3]:[rank11]: return self._call_impl(*args, **kwargs)
525
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
526
+ [default3]:[rank11]: return forward_call(*args, **kwargs)
527
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
528
+ [default3]:[rank11]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
529
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 786, in forward_with_hidden_states
530
+ [default3]:[rank11]: fp32_sharded_logits = self.cast_to_fp32(x=sharded_logits)["output"]
531
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
532
+ [default3]:[rank11]: return self._call_impl(*args, **kwargs)
533
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
534
+ [default3]:[rank11]: return forward_call(*args, **kwargs)
535
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
536
+ [default3]:[rank11]: output = self.pp_block(**new_kwargs)
537
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 753, in <lambda>
538
+ [default3]:[rank11]: module_builder=lambda: lambda x: x.float(),
539
+ [default3]:[rank11]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.14 GiB. GPU  has a total capacity of 79.33 GiB of which 4.88 GiB is free. Including non-PyTorch memory, this process has 74.43 GiB memory in use. Of the allocated memory 66.95 GiB is allocated by PyTorch, and 170.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
540
+ [default0]:[rank8]: OSError: [Errno 122] Disk quota exceeded
541
+ [default0]:
542
+ [default0]:[rank8]: During handling of the above exception, another exception occurred:
543
+ [default0]:
544
+ [default0]:[rank8]: Traceback (most recent call last):
545
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
546
+ [default0]:[rank8]: trainer.train(dataloader)
547
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
548
+ [default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
549
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
550
+ [default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter(
551
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
552
+ [default0]:[rank8]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
553
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
554
+ [default0]:[rank8]: output = model(**micro_batch)
555
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
556
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
557
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
558
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
559
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
560
+ [default0]:[rank8]: sharded_logits = self.model(
561
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
562
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
563
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
564
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
565
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
566
+ [default0]:[rank8]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
567
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
568
+ [default0]:[rank8]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
569
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
570
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
571
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
572
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
573
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
574
+ [default0]:[rank8]: output = self.pp_block(**new_kwargs)
575
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
576
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
577
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
578
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
579
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
580
+ [default0]:[rank8]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
581
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
582
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
583
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
584
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
585
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 566, in forward
586
+ [default0]:[rank8]: query_states, key_value_states = self.flash_rotary_embedding(query_states, kv=key_value_states)
587
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
588
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
589
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
590
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
591
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/layers/rotary.py", line 457, in forward
592
+ [default0]:[rank8]: q = apply_rotary_emb_func(
593
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/layers/rotary.py", line 122, in apply_rotary_emb
594
+ [default0]:[rank8]: return ApplyRotaryEmb.apply(
595
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
596
+ [default0]:[rank8]: return super().apply(*args, **kwargs) # type: ignore[misc]
597
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/layers/rotary.py", line 48, in forward
598
+ [default0]:[rank8]: out = apply_rotary(
599
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/rotary.py", line 202, in apply_rotary
600
+ [default0]:[rank8]: rotary_kernel[grid](
601
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 167, in <lambda>
602
+ [default0]:[rank8]: return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
603
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 416, in run
604
+ [default0]:[rank8]: self.cache[device][key] = compile(
605
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/compiler/compiler.py", line 194, in compile
606
+ [default0]:[rank8]: metadata_group[f"{src.name}.{ext}"] = fn_cache_manager.put(next_module, f"{src.name}.{ext}")
607
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/cache.py", line 123, in put
608
+ [default0]:[rank8]: with open(temp_path, mode) as f:
609
+ [default0]:[rank8]: OSError: [Errno 122] Disk quota exceeded
610
+ [default6]:[rank14]: Traceback (most recent call last):
611
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
612
+ [default6]:[rank14]: trainer.train(dataloader)
613
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
614
+ [default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
615
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
616
+ [default6]:[rank14]: outputs = self.pipeline_engine.train_batch_iter(
617
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
618
+ [default6]:[rank14]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
619
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
620
+ [default6]:[rank14]: output = model(**micro_batch)
621
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
622
+ [default6]:[rank14]: return self._call_impl(*args, **kwargs)
623
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
624
+ [default6]:[rank14]: return forward_call(*args, **kwargs)
625
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
626
+ [default6]:[rank14]: sharded_logits = self.model(
627
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
628
+ [default6]:[rank14]: return self._call_impl(*args, **kwargs)
629
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
630
+ [default6]:[rank14]: return forward_call(*args, **kwargs)
631
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
632
+ [default6]:[rank14]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
633
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 786, in forward_with_hidden_states
634
+ [default6]:[rank14]: fp32_sharded_logits = self.cast_to_fp32(x=sharded_logits)["output"]
635
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
636
+ [default6]:[rank14]: return self._call_impl(*args, **kwargs)
637
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
638
+ [default6]:[rank14]: return forward_call(*args, **kwargs)
639
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
640
+ [default6]:[rank14]: output = self.pp_block(**new_kwargs)
641
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 753, in <lambda>
642
+ [default6]:[rank14]: module_builder=lambda: lambda x: x.float(),
643
+ [default6]:[rank14]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.14 GiB. GPU  has a total capacity of 79.33 GiB of which 4.88 GiB is free. Including non-PyTorch memory, this process has 74.43 GiB memory in use. Of the allocated memory 66.95 GiB is allocated by PyTorch, and 170.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
644
+ [default4]:[rank12]: Traceback (most recent call last):
645
+ [default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
646
+ [default4]:[rank12]: trainer.train(dataloader)
647
+ [default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
648
+ [default4]:[rank12]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
649
+ [default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
650
+ [default4]:[rank12]: outputs = self.pipeline_engine.train_batch_iter(
651
+ [default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
652
+ [default4]:[rank12]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
653
+ [default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
654
+ [default4]:[rank12]: output = model(**micro_batch)
655
+ [default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
656
+ [default4]:[rank12]: return self._call_impl(*args, **kwargs)
657
+ [default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
658
+ [default4]:[rank12]: return forward_call(*args, **kwargs)
659
+ [default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
660
+ [default4]:[rank12]: sharded_logits = self.model(
661
+ [default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
662
+ [default4]:[rank12]: return self._call_impl(*args, **kwargs)
663
+ [default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
664
+ [default4]:[rank12]: return forward_call(*args, **kwargs)
665
+ [default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
666
+ [default4]:[rank12]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
667
+ [default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 786, in forward_with_hidden_states
668
+ [default4]:[rank12]: fp32_sharded_logits = self.cast_to_fp32(x=sharded_logits)["output"]
669
+ [default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
670
+ [default4]:[rank12]: return self._call_impl(*args, **kwargs)
671
+ [default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
672
+ [default4]:[rank12]: return forward_call(*args, **kwargs)
673
+ [default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
674
+ [default4]:[rank12]: output = self.pp_block(**new_kwargs)
675
+ [default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 753, in <lambda>
676
+ [default4]:[rank12]: module_builder=lambda: lambda x: x.float(),
677
+ [default4]:[rank12]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.14 GiB. GPU  has a total capacity of 79.33 GiB of which 4.88 GiB is free. Including non-PyTorch memory, this process has 74.43 GiB memory in use. Of the allocated memory 66.95 GiB is allocated by PyTorch, and 170.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
678
+ W0702 16:31:05.158000 139728007939904 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 867663 closing signal SIGTERM
679
+ W0702 16:31:05.164000 139728007939904 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 867664 closing signal SIGTERM
680
+ W0702 16:31:05.161000 140288511039296 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1215877 closing signal SIGTERM
681
+ W0702 16:31:05.161000 140288511039296 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1215878 closing signal SIGTERM
682
+ W0702 16:31:05.161000 140288511039296 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1215880 closing signal SIGTERM
683
+ W0702 16:31:05.162000 140288511039296 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 1215882 closing signal SIGTERM
684
+ W0702 16:31:05.169000 139728007939904 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 867665 closing signal SIGTERM
685
+ W0702 16:31:05.172000 139728007939904 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 867666 closing signal SIGTERM
686
+ W0702 16:31:05.177000 139728007939904 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 867667 closing signal SIGTERM
687
+ W0702 16:31:05.185000 139728007939904 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 867668 closing signal SIGTERM
688
+ E0702 16:31:06.175000 140288511039296 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 2 (pid: 1215879) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
689
+ Traceback (most recent call last):
690
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
691
+ sys.exit(main())
692
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
693
+ return f(*args, **kwargs)
694
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
695
+ run(args)
696
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
697
+ elastic_launch(
698
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
699
+ return launch_agent(self._config, self._entrypoint, list(args))
700
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
701
+ raise ChildFailedError(
702
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
703
+ ============================================================
704
+ /fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
705
+ ------------------------------------------------------------
706
+ Failures:
707
+ [1]:
708
+ time : 2024-07-02_16:31:05
709
+ host : ip-26-0-162-233.ec2.internal
710
+ rank : 12 (local_rank: 4)
711
+ exitcode : 1 (pid: 1215881)
712
+ error_file: <N/A>
713
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
714
+ [2]:
715
+ time : 2024-07-02_16:31:05
716
+ host : ip-26-0-162-233.ec2.internal
717
+ rank : 14 (local_rank: 6)
718
+ exitcode : 1 (pid: 1215883)
719
+ error_file: <N/A>
720
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
721
+ [3]:
722
+ time : 2024-07-02_16:31:05
723
+ host : ip-26-0-162-233.ec2.internal
724
+ rank : 15 (local_rank: 7)
725
+ exitcode : 1 (pid: 1215884)
726
+ error_file: <N/A>
727
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
728
+ ------------------------------------------------------------
729
+ Root Cause (first observed failure):
730
+ [0]:
731
+ time : 2024-07-02_16:31:05
732
+ host : ip-26-0-162-233.ec2.internal
733
+ rank : 10 (local_rank: 2)
734
+ exitcode : 1 (pid: 1215879)
735
+ error_file: <N/A>
736
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
737
+ ============================================================
738
+ srun: error: ip-26-0-162-233: task 1: Exited with exit code 1
739
+ E0702 16:31:07.108000 139728007939904 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 867661) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
740
+ Traceback (most recent call last):
741
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
742
+ sys.exit(main())
743
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
744
+ return f(*args, **kwargs)
745
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
746
+ run(args)
747
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
748
+ elastic_launch(
749
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
750
+ return launch_agent(self._config, self._entrypoint, list(args))
751
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
752
+ raise ChildFailedError(
753
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
754
+ ============================================================
755
+ /fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
756
+ ------------------------------------------------------------
757
+ Failures:
758
+ [1]:
759
+ time : 2024-07-02_16:31:05
760
+ host : ip-26-0-160-192.ec2.internal
761
+ rank : 1 (local_rank: 1)
762
+ exitcode : 1 (pid: 867662)
763
+ error_file: <N/A>
764
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
765
+ ------------------------------------------------------------
766
+ Root Cause (first observed failure):
767
+ [0]:
768
+ time : 2024-07-02_16:31:05
769
+ host : ip-26-0-160-192.ec2.internal
770
+ rank : 0 (local_rank: 0)
771
+ exitcode : 1 (pid: 867661)
772
+ error_file: <N/A>
773
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
774
+ ============================================================
775
+ srun: error: ip-26-0-160-192: task 0: Exited with exit code 1
776
+ Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-8/status.txt ADDED
@@ -0,0 +1 @@
1
+ oom
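
Editorial note (not part of the uploaded benchmark files): the run above ends with `torch.cuda.OutOfMemoryError`, which is why `status.txt` records `oom`. As a hedged sketch only, two common mitigations follow. The allocator setting is quoted verbatim from the PyTorch error message in the log; re-launching with a smaller micro-batch (e.g. a `mbz-4` variant of this `mbz-8` config) is an assumption about how one would regenerate the config, not something the log itself confirms.

# Editorial sketch in bash, mirroring the repo's Slurm scripts; assumptions noted inline.
# 1) Allocator hint taken directly from the OOM message above (reduces fragmentation):
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# 2) Assumed alternative: rerun with a smaller micro-batch size (e.g. 4 instead of 8),
#    which roughly halves per-step activation memory at the same global batch size.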