3outeille (HF staff) committed
Commit 0dbd815
1 Parent(s): 1b21f93

Upload llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4

llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4/bench.slurm ADDED
@@ -0,0 +1,111 @@
+ #!/bin/bash
+
+ #SBATCH --job-name=bench_cluster
+ #SBATCH --time=00:59:00
+ #SBATCH --partition=hopper-prod
+ #SBATCH --nodes=2
+ #SBATCH --gres=gpu:8
+ #SBATCH --qos=high
+ #SBATCH --ntasks-per-node=1
+ #SBATCH --cpus-per-task=96
+ #SBATCH --exclusive
+ #SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4/log.out
+ #SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4/log.out
+
+ # Function to update status based on squeue output
+ update_status() {
+     job_id=$1
+     status_file=$2
+     # For unknown reasons, it doesn't update the status for pending jobs; it only works for running ones.
+     while true; do
+         job_status=$(squeue --job $job_id --noheader --format=%T)
+         echo "Job status: $job_status"
+         if [ -z "$job_status" ]; then
+             # Job has finished or is not found
+             break
+         elif [ "$job_status" = "RUNNING" ]; then
+             printf "running" > $status_file
+             break
+         fi
+         sleep 10
+     done
+ }
+
+ # Misc initializations.
+ echo "========================"
+ echo "START TIME: $(date)"
+ source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
+ conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster
+ echo python3 version = $(python3 --version)
+ echo "========================"
+
+ # Slurm stuff
+ export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+ export MASTER_PORT=$((1024 + RANDOM % 64511))
+
+ export TMPDIR=/scratch
+ export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache"
+ export CUBLAS_WORKSPACE_CONFIG=":4096:8"
+ export CUDA_DEVICE_MAX_CONNECTIONS="1"
+
+ huggingface-cli login --token $HUGGINGFACE_TOKEN
+
+
+ NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
+ CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4/config.yaml"
+
+ LAUNCHER="torchrun \
+     --nproc_per_node 8 \
+     --nnodes 2 \
+     --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \
+     --rdzv_backend c10d \
+     --max_restarts 0 \
+     --tee 3 \
+     --node_rank ${SLURM_PROCID}"
+
+ # Checkout the bench_cluster branch
+ cd $NANOTRON_REPO
+ git checkout bench_cluster
+ cd ..
+ # Get the current job ID
+ job_id=${SLURM_JOB_ID}
+
+ # Update status to "pending" or "running" in the background
+ update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4/status.txt &
+
+ # Run the main command
+ srun -u $LAUNCHER $CMD
+ exit_status=$?
+
+ # Update status based on the exit status of `srun`
+ if [ $exit_status -eq 0 ]; then
+     printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4/status.txt
+ else
+     if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4/log.out; then
+         printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4/status.txt
+     elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4/log.out; then
+         printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4/status.txt
+     elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4/log.out; then
+         printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4/status.txt
+     else
+         printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4/status.txt
+     fi
+ fi
+
+ # Run the report script if the job completed successfully
+ if [ $exit_status -eq 0 ]; then
+     python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4 --is_logs
+     python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4 --is_profiler
+ fi
+
+
+ # Push the results folder to the Hub using huggingface-cli
+ huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4 llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4 --commit-message "Upload llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4"
+
+ # Verify the upload
+ if [ $? -eq 0 ]; then
+     echo "Uploading to Huggingface Hub successful"
+ else
+     echo "Failed to upload to Huggingface Hub"
+ fi
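
A rough usage sketch for the script above (assuming it is submitted from the repository root and that $HUGGINGFACE_TOKEN is set in the environment, neither of which is stated in the upload itself):

    sbatch llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4/bench.slurm
    # status.txt is written by update_status() and the post-run checks: running, completed, oom, timeout, or fail
    cat /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4/status.txt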
llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4/config.yaml ADDED
@@ -0,0 +1,90 @@
+ general:
+   project: bench_cluster
+   seed: 42
+ model:
+   ddp_bucket_cap_mb: 25
+   dtype: bfloat16
+   init_method:
+     std: 0.025
+   make_vocab_size_divisible_by: 1
+   model_config:
+     bos_token_id: 1
+     eos_token_id: 2
+     hidden_act: silu
+     hidden_size: 2048
+     initializer_range: 0.02
+     intermediate_size: 4096
+     is_llama_config: true
+     max_position_embeddings: 4096
+     num_attention_heads: 32
+     num_hidden_layers: 24
+     num_key_value_heads: 32
+     pad_token_id: null
+     pretraining_tp: 1
+     rms_norm_eps: 1.0e-05
+     rope_scaling: null
+     rope_theta: 10000.0
+     tie_word_embeddings: true
+     use_cache: true
+     vocab_size: 50257
+ optimizer:
+   accumulate_grad_in_fp32: true
+   clip_grad: 1.0
+   learning_rate_scheduler:
+     learning_rate: 0.0001
+     lr_decay_style: linear
+     lr_warmup_style: linear
+     lr_warmup_steps: 1
+     min_decay_lr: 1.0e-05
+   optimizer_factory:
+     adam_beta1: 0.9
+     adam_beta2: 0.95
+     adam_eps: 1.0e-08
+     name: adamW
+     torch_adam_is_fused: true
+   weight_decay: 0.01
+   zero_stage: 1
+ parallelism:
+   dp: 8
+   expert_parallel_size: 1
+   pp: 1
+   pp_engine: 1f1b
+   tp: 2
+   tp_linear_async_communication: false
+   tp_mode: REDUCE_SCATTER
+ profiler:
+   profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4
+ tokenizer:
+   tokenizer_max_length: null
+   tokenizer_name_or_path: openai-community/gpt2
+   tokenizer_revision: null
+ data_stages:
+ - name: Training Stage
+   start_training_step: 1
+   data:
+     dataset:
+       dataset_overwrite_cache: false
+       dataset_processing_num_proc_per_process: 64
+       hf_dataset_config_name: null
+       hf_dataset_or_datasets: roneneldan/TinyStories
+       hf_dataset_splits: train
+       text_column_name: text
+     num_loading_workers: 32
+     seed: 42
+ lighteval: null
+ tokens:
+   train_steps: 20
+   val_check_interval: -1
+   batch_accumulation_per_replica: 32
+   limit_test_batches: 0
+   limit_val_batches: 0
+   micro_batch_size: 4
+   sequence_length: 4096
+ logging:
+   iteration_step_info_interval: 1
+   log_level: info
+   log_level_replica: info
+ checkpoints:
+   checkpoint_interval: 100000
+   checkpoints_path: /dev/null
+   resume_checkpoint_path: null
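
For reference, with this configuration the effective global batch size works out to micro_batch_size × batch_accumulation_per_replica × dp = 4 × 32 × 8 = 1024 sequences per optimizer step, i.e. 1024 × 4096 ≈ 4.19M tokens per step across the 16 GPUs (dp=8 × tp=2 × pp=1); the training log below reports the same global_batch_size of 1024.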
llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4/log.out ADDED
@@ -0,0 +1,930 @@
1
+ ========================
2
+ START TIME: Tue Jul 2 16:31:21 UTC 2024
3
+ python3 version = Python 3.10.14
4
+ ========================
5
+ The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
6
+ Token is valid (permission: write).
7
+ Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token
8
+ Login successful
9
+ Already on 'bench_cluster'
10
+ M examples/config_tiny_llama.py
11
+ M examples/config_tiny_llama.yaml
12
+ M examples/train_tiny_llama.sh
13
+ M src/nanotron/models/llama.py
14
+ M src/nanotron/trainer.py
15
+ Your branch is up to date with 'origin/bench_cluster'.
16
+ Job status: RUNNING
17
+ W0702 16:31:24.185000 140446723327808 torch/distributed/run.py:757]
18
+ W0702 16:31:24.185000 140446723327808 torch/distributed/run.py:757] *****************************************
19
+ W0702 16:31:24.185000 140446723327808 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
20
+ W0702 16:31:24.185000 140446723327808 torch/distributed/run.py:757] *****************************************
21
+ W0702 16:31:24.204000 139668155955008 torch/distributed/run.py:757]
22
+ W0702 16:31:24.204000 139668155955008 torch/distributed/run.py:757] *****************************************
23
+ W0702 16:31:24.204000 139668155955008 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
24
+ W0702 16:31:24.204000 139668155955008 torch/distributed/run.py:757] *****************************************
25
+ [default0]:07/02/2024 16:31:42 [WARNING|DP=0|PP=0|TP=0|ip-26-0-171-62]: [Vocab Size Padding] Padded vocab (size: 50257) with 1 dummy tokens (new size: 50258)
26
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: Config:
27
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: Config(general=GeneralArgs(project='bench_cluster',
28
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: run='%date_%jobid',
29
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: seed=42,
30
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: step=None,
31
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: consumed_train_samples=None,
32
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: benchmark_csv_path=None,
33
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: ignore_sanity_checks=True),
34
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: parallelism=ParallelismArgs(dp=8,
35
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: pp=1,
36
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: tp=2,
37
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: pp_engine=<nanotron.parallel.pipeline_parallel.engine.OneForwardOneBackwardPipelineEngine object at 0x7f3c75348790>,
38
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: tp_mode=<TensorParallelLinearMode.REDUCE_SCATTER: 2>,
39
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: tp_linear_async_communication=False,
40
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: expert_parallel_size=1),
41
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1,
42
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: eos_token_id=2,
43
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: hidden_act='silu',
44
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: hidden_size=2048,
45
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: initializer_range=0.02,
46
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: intermediate_size=4096,
47
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: is_llama_config=True,
48
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: max_position_embeddings=4096,
49
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: num_attention_heads=32,
50
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: num_hidden_layers=24,
51
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: num_key_value_heads=32,
52
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: pad_token_id=None,
53
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: pretraining_tp=1,
54
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: rms_norm_eps=1e-05,
55
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: rope_scaling=None,
56
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: rope_theta=10000.0,
57
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: tie_word_embeddings=True,
58
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: use_cache=True,
59
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: vocab_size=50258),
60
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: init_method=RandomInit(std=0.025),
61
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: dtype=torch.bfloat16,
62
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: make_vocab_size_divisible_by=1,
63
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: ddp_bucket_cap_mb=25),
64
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2',
65
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: tokenizer_revision=None,
66
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: tokenizer_max_length=None),
67
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'),
68
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: checkpoint_interval=100000,
69
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: save_initial_state=False,
70
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: resume_checkpoint_path=None,
71
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: checkpoints_path_is_shared_file_system=False),
72
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: logging=LoggingArgs(log_level='info',
73
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: log_level_replica='info',
74
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: iteration_step_info_interval=1),
75
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: tokens=TokensArgs(sequence_length=4096,
76
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: train_steps=20,
77
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: micro_batch_size=4,
78
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: batch_accumulation_per_replica=32,
79
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: val_check_interval=-1,
80
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: limit_val_batches=0,
81
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: limit_test_batches=0),
82
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08,
83
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: adam_beta1=0.9,
84
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: adam_beta2=0.95,
85
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: torch_adam_is_fused=True,
86
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: name='adamW'),
87
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: zero_stage=1,
88
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: weight_decay=0.01,
89
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: clip_grad=1.0,
90
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: accumulate_grad_in_fp32=True,
91
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001,
92
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: lr_warmup_steps=1,
93
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: lr_warmup_style='linear',
94
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: lr_decay_style='linear',
95
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: lr_decay_steps=19,
96
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: lr_decay_starting_step=None,
97
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: min_decay_lr=1e-05)),
98
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: data_stages=[DatasetStageArgs(name='Training Stage',
99
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: start_training_step=1,
100
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories',
101
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: hf_dataset_splits='train',
102
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: hf_dataset_config_name=None,
103
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: dataset_processing_num_proc_per_process=64,
104
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: dataset_overwrite_cache=False,
105
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: text_column_name='text'),
106
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: seed=42,
107
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: num_loading_workers=32))],
108
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4')),
109
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: lighteval=None)
110
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: Model Config:
111
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: LlamaConfig(bos_token_id=1,
112
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: eos_token_id=2,
113
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: hidden_act='silu',
114
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: hidden_size=2048,
115
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: initializer_range=0.02,
116
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: intermediate_size=4096,
117
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: is_llama_config=True,
118
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: max_position_embeddings=4096,
119
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: num_attention_heads=32,
120
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: num_hidden_layers=24,
121
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: num_key_value_heads=32,
122
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: pad_token_id=None,
123
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: pretraining_tp=1,
124
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: rms_norm_eps=1e-05,
125
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: rope_scaling=None,
126
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: rope_theta=10000.0,
127
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: tie_word_embeddings=True,
128
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: use_cache=True,
129
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: vocab_size=50258)
130
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: Building model..
131
+ [default0]:07/02/2024 16:31:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: Setting PP block ranks...
132
+ [default1]:07/02/2024 16:31:53 [INFO|DP=0|PP=0|TP=1|ip-26-0-171-62]: Local number of parameters: 555M (1058.35MiB)
133
+ [default1]:07/02/2024 16:31:53 [INFO|DP=0|PP=0|TP=1|ip-26-0-171-62]: [After model building] Memory usage: 1082.37MiB. Peak allocated: 1182.56MiB Peak reserved: 1200.00MiB
134
+ [default1]:07/02/2024 16:31:53 [INFO|DP=0|PP=0|TP=1|ip-26-0-171-62]: No checkpoint path provided.
135
+ [default0]:07/02/2024 16:31:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: Total number of parameters: 1.11G (2116.70MiB)
136
+ [default0]:07/02/2024 16:31:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: Local number of parameters: 555M (1058.35MiB)
137
+ [default0]:07/02/2024 16:31:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: [After model building] Memory usage: 1082.37MiB. Peak allocated: 1182.56MiB Peak reserved: 1200.00MiB
138
+ [default0]:07/02/2024 16:31:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: No checkpoint path provided.
139
+ [default0]:07/02/2024 16:31:53 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: Parametrizing model parameters using StandardParametrizator
140
+ [default2]:07/02/2024 16:31:53 [INFO|DP=5|PP=0|TP=0|ip-26-0-171-88]: No checkpoint path provided.
141
+ [default3]:07/02/2024 16:31:53 [INFO|DP=5|PP=0|TP=1|ip-26-0-171-88]: No checkpoint path provided.
142
+ [default0]:07/02/2024 16:31:53 [INFO|DP=4|PP=0|TP=0|ip-26-0-171-88]: No checkpoint path provided.
143
+ [default1]:07/02/2024 16:31:53 [INFO|DP=4|PP=0|TP=1|ip-26-0-171-88]: No checkpoint path provided.
144
+ [default5]:07/02/2024 16:31:53 [INFO|DP=2|PP=0|TP=1|ip-26-0-171-62]: No checkpoint path provided.
145
+ [default3]:07/02/2024 16:31:53 [INFO|DP=1|PP=0|TP=1|ip-26-0-171-62]: No checkpoint path provided.
146
+ [default2]:07/02/2024 16:31:53 [INFO|DP=1|PP=0|TP=0|ip-26-0-171-62]: No checkpoint path provided.
147
+ [default4]:07/02/2024 16:31:53 [INFO|DP=2|PP=0|TP=0|ip-26-0-171-62]: No checkpoint path provided.
148
+ [default7]:07/02/2024 16:31:53 [INFO|DP=7|PP=0|TP=1|ip-26-0-171-88]: No checkpoint path provided.
149
+ [default6]:07/02/2024 16:31:53 [INFO|DP=7|PP=0|TP=0|ip-26-0-171-88]: No checkpoint path provided.
150
+ [default5]:07/02/2024 16:31:53 [INFO|DP=6|PP=0|TP=1|ip-26-0-171-88]: No checkpoint path provided.
151
+ [default4]:07/02/2024 16:31:53 [INFO|DP=6|PP=0|TP=0|ip-26-0-171-88]: No checkpoint path provided.
152
+ [default7]:07/02/2024 16:31:53 [INFO|DP=3|PP=0|TP=1|ip-26-0-171-62]: No checkpoint path provided.
153
+ [default6]:07/02/2024 16:31:53 [INFO|DP=3|PP=0|TP=0|ip-26-0-171-62]: No checkpoint path provided.
154
+ [default0]:07/02/2024 16:31:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: [Optimizer Building] Using LearningRateForSP as learning rate
155
+ [default0]:07/02/2024 16:31:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: [ZeRO sharding] Size of optimizer params per rank:
156
+ [default0]:07/02/2024 16:31:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: [ZeRO sharding] DP Rank 0 has 69.4M out of 555M (12.50%) params' optimizer states
157
+ [default0]:07/02/2024 16:31:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: [ZeRO sharding] DP Rank 1 has 69.4M out of 555M (12.50%) params' optimizer states
158
+ [default0]:07/02/2024 16:31:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: [ZeRO sharding] DP Rank 2 has 69.4M out of 555M (12.50%) params' optimizer states
159
+ [default0]:07/02/2024 16:31:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: [ZeRO sharding] DP Rank 3 has 69.4M out of 555M (12.50%) params' optimizer states
160
+ [default0]:07/02/2024 16:31:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: [ZeRO sharding] DP Rank 4 has 69.4M out of 555M (12.50%) params' optimizer states
161
+ [default0]:07/02/2024 16:31:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: [ZeRO sharding] DP Rank 5 has 69.4M out of 555M (12.50%) params' optimizer states
162
+ [default0]:07/02/2024 16:31:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: [ZeRO sharding] DP Rank 6 has 69.4M out of 555M (12.50%) params' optimizer states
163
+ [default0]:07/02/2024 16:31:58 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: [ZeRO sharding] DP Rank 7 has 69.4M out of 555M (12.50%) params' optimizer states
164
+ [default0]:07/02/2024 16:32:00 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
165
+ [default0]:07/02/2024 16:32:00 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: Using `datasets` library
166
+ [default0]:07/02/2024 16:32:00 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
167
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
168
+ [default0]:07/02/2024 16:32:01 [WARNING|DP=0|PP=0|TP=0|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty.
169
+ [default0]:07/02/2024 16:32:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: [Training Plan] There are 1 training stages
170
+ [default0]:07/02/2024 16:32:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: [Stage Training Stage] start from step 1
171
+ [default0]:07/02/2024 16:32:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]:
172
+ [default0]:07/02/2024 16:32:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: [Start training] datetime: 2024-07-02 16:32:01.872683 | mbs: 4 | grad_accum: 32 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
173
+ [default0]:07/02/2024 16:32:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
174
+ [default0]:07/02/2024 16:32:01 [INFO|DP=0|PP=0|TP=0|ip-26-0-171-62]: Memory usage: 3463.66MiB. Peak allocated 3463.66MiB. Peak reserved: 3584.00MiB
175
+ [default4]:07/02/2024 16:32:02 [WARNING|DP=2|PP=0|TP=0|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty.
176
+ [default5]:07/02/2024 16:32:02 [WARNING|DP=2|PP=0|TP=1|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty.
177
+ [default0]:07/02/2024 16:32:02 [WARNING|DP=4|PP=0|TP=0|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
178
+ [default1]:07/02/2024 16:32:02 [WARNING|DP=0|PP=0|TP=1|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty.
179
+ [default6]:07/02/2024 16:32:02 [WARNING|DP=7|PP=0|TP=0|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
180
+ [default2]:07/02/2024 16:32:02 [WARNING|DP=5|PP=0|TP=0|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
181
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
182
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
183
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
184
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
185
+ [default4]:07/02/2024 16:32:02 [WARNING|DP=6|PP=0|TP=0|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
186
+ [default5]:07/02/2024 16:32:02 [WARNING|DP=6|PP=0|TP=1|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
187
+ [default6]:07/02/2024 16:32:02 [WARNING|DP=3|PP=0|TP=0|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty.
188
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
189
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
190
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
191
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
192
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
193
+ [default3]:07/02/2024 16:32:02 [WARNING|DP=1|PP=0|TP=1|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty.
194
+ [default2]:07/02/2024 16:32:02 [WARNING|DP=1|PP=0|TP=0|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty.
195
+ [default1]:07/02/2024 16:32:02 [WARNING|DP=4|PP=0|TP=1|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
196
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
197
+ [default3]:07/02/2024 16:32:02 [WARNING|DP=5|PP=0|TP=1|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
198
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
199
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
200
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
201
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
202
+ [default7]:07/02/2024 16:32:02 [WARNING|DP=7|PP=0|TP=1|ip-26-0-171-88]: Repo card metadata block was not found. Setting CardData to empty.
203
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
204
+ [default7]:07/02/2024 16:32:02 [WARNING|DP=3|PP=0|TP=1|ip-26-0-171-62]: Repo card metadata block was not found. Setting CardData to empty.
205
+ [default0]:[rank0]: OSError: [Errno 122] Disk quota exceeded
206
+ [default0]:
207
+ [default0]:[rank0]: During handling of the above exception, another exception occurred:
208
+ [default0]:
209
+ [default0]:[rank0]: Traceback (most recent call last):
210
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
211
+ [default0]:[rank0]: trainer.train(dataloader)
212
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
213
+ [default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
214
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
215
+ [default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter(
216
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
217
+ [default0]:[rank0]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
218
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
219
+ [default0]:[rank0]: output = model(**micro_batch)
220
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
221
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
222
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
223
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
224
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
225
+ [default0]:[rank0]: sharded_logits = self.model(
226
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
227
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
228
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
229
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
230
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
231
+ [default0]:[rank0]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
232
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
233
+ [default0]:[rank0]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
234
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
235
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
236
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
237
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
238
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
239
+ [default0]:[rank0]: output = self.pp_block(**new_kwargs)
240
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
241
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
242
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
243
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
244
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 629, in forward
245
+ [default0]:[rank0]: hidden_states = self.input_layernorm(hidden_states)
246
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
247
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
248
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
249
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
250
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/nn/layer_norm.py", line 42, in forward
251
+ [default0]:[rank0]: return layer_norm_fn(
252
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 875, in layer_norm_fn
253
+ [default0]:[rank0]: return LayerNormFn.apply(
254
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
255
+ [default0]:[rank0]: return super().apply(*args, **kwargs) # type: ignore[misc]
256
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 748, in forward
257
+ [default0]:[rank0]: y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = _layer_norm_fwd(
258
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 335, in _layer_norm_fwd
259
+ [default0]:[rank0]: _layer_norm_fwd_1pass_kernel[(M,)](
260
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 167, in <lambda>
261
+ [default0]:[rank0]: return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
262
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 143, in run
263
+ [default0]:[rank0]: timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
264
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 143, in <dictcomp>
265
+ [default0]:[rank0]: timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
266
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 122, in _bench
267
+ [default0]:[rank0]: return do_bench(kernel_call, warmup=self.warmup, rep=self.rep, quantiles=(0.5, 0.2, 0.8))
268
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/testing.py", line 102, in do_bench
269
+ [default0]:[rank0]: fn()
270
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 110, in kernel_call
271
+ [default0]:[rank0]: self.fn.run(
272
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
273
+ [default0]:[rank0]: return self.fn.run(*args, **kwargs)
274
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
275
+ [default0]:[rank0]: return self.fn.run(*args, **kwargs)
276
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
277
+ [default0]:[rank0]: return self.fn.run(*args, **kwargs)
278
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 416, in run
279
+ [default0]:[rank0]: self.cache[device][key] = compile(
280
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/compiler/compiler.py", line 194, in compile
281
+ [default0]:[rank0]: metadata_group[f"{src.name}.{ext}"] = fn_cache_manager.put(next_module, f"{src.name}.{ext}")
282
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/cache.py", line 123, in put
283
+ [default0]:[rank0]: with open(temp_path, mode) as f:
284
+ [default0]:[rank0]: OSError: [Errno 122] Disk quota exceeded
285
+ [default5]:[rank5]: OSError: [Errno 122] Disk quota exceeded
286
+ [default5]:
287
+ [default5]:[rank5]: During handling of the above exception, another exception occurred:
288
+ [default5]:
289
+ [default5]:[rank5]: Traceback (most recent call last):
290
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
291
+ [default5]:[rank5]: trainer.train(dataloader)
292
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
293
+ [default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
294
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
295
+ [default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter(
296
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
297
+ [default5]:[rank5]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
298
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
299
+ [default5]:[rank5]: output = model(**micro_batch)
300
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
301
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
302
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
303
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
304
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
305
+ [default5]:[rank5]: sharded_logits = self.model(
306
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
307
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
308
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
309
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
310
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
311
+ [default5]:[rank5]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
312
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
313
+ [default5]:[rank5]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
314
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
315
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
316
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
317
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
318
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
319
+ [default5]:[rank5]: output = self.pp_block(**new_kwargs)
320
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
321
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
322
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
323
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
324
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 629, in forward
325
+ [default5]:[rank5]: hidden_states = self.input_layernorm(hidden_states)
326
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
327
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
328
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
329
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
330
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/nn/layer_norm.py", line 42, in forward
331
+ [default5]:[rank5]: return layer_norm_fn(
332
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 875, in layer_norm_fn
333
+ [default5]:[rank5]: return LayerNormFn.apply(
334
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
335
+ [default5]:[rank5]: return super().apply(*args, **kwargs) # type: ignore[misc]
336
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 748, in forward
337
+ [default5]:[rank5]: y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = _layer_norm_fwd(
338
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 335, in _layer_norm_fwd
339
+ [default5]:[rank5]: _layer_norm_fwd_1pass_kernel[(M,)](
340
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 167, in <lambda>
341
+ [default5]:[rank5]: return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
342
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 143, in run
343
+ [default5]:[rank5]: timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
344
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 143, in <dictcomp>
345
+ [default5]:[rank5]: timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
346
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 122, in _bench
347
+ [default5]:[rank5]: return do_bench(kernel_call, warmup=self.warmup, rep=self.rep, quantiles=(0.5, 0.2, 0.8))
348
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/testing.py", line 102, in do_bench
349
+ [default5]:[rank5]: fn()
350
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 110, in kernel_call
351
+ [default5]:[rank5]: self.fn.run(
352
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
353
+ [default5]:[rank5]: return self.fn.run(*args, **kwargs)
354
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
355
+ [default5]:[rank5]: return self.fn.run(*args, **kwargs)
356
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
357
+ [default5]:[rank5]: return self.fn.run(*args, **kwargs)
358
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 416, in run
359
+ [default5]:[rank5]: self.cache[device][key] = compile(
360
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/compiler/compiler.py", line 194, in compile
361
+ [default5]:[rank5]: metadata_group[f"{src.name}.{ext}"] = fn_cache_manager.put(next_module, f"{src.name}.{ext}")
362
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/cache.py", line 123, in put
363
+ [default5]:[rank5]: with open(temp_path, mode) as f:
364
+ [default5]:[rank5]: OSError: [Errno 122] Disk quota exceeded
365
+ [default3]:[rank3]: OSError: [Errno 122] Disk quota exceeded
366
+ [default3]:
367
+ [default3]:[rank3]: During handling of the above exception, another exception occurred:
368
+ [default3]:
369
+ [default3]:[rank3]: Traceback (most recent call last):
370
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
371
+ [default3]:[rank3]: trainer.train(dataloader)
372
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
373
+ [default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
374
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
375
+ [default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter(
376
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
377
+ [default3]:[rank3]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
378
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
379
+ [default3]:[rank3]: output = model(**micro_batch)
380
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
381
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
382
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
383
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
384
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
385
+ [default3]:[rank3]: sharded_logits = self.model(
386
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
387
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
388
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
389
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
390
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
391
+ [default3]:[rank3]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
392
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
393
+ [default3]:[rank3]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
394
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
395
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
396
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
397
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
398
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
399
+ [default3]:[rank3]: output = self.pp_block(**new_kwargs)
400
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
401
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
402
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
403
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
404
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 629, in forward
405
+ [default3]:[rank3]: hidden_states = self.input_layernorm(hidden_states)
406
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
407
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
408
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
409
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
410
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/nn/layer_norm.py", line 42, in forward
411
+ [default3]:[rank3]: return layer_norm_fn(
412
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 875, in layer_norm_fn
413
+ [default3]:[rank3]: return LayerNormFn.apply(
414
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
415
+ [default3]:[rank3]: return super().apply(*args, **kwargs) # type: ignore[misc]
416
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 748, in forward
417
+ [default3]:[rank3]: y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = _layer_norm_fwd(
418
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 335, in _layer_norm_fwd
419
+ [default3]:[rank3]: _layer_norm_fwd_1pass_kernel[(M,)](
420
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 167, in <lambda>
421
+ [default3]:[rank3]: return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
422
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 143, in run
423
+ [default3]:[rank3]: timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
424
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 143, in <dictcomp>
425
+ [default3]:[rank3]: timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
426
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 122, in _bench
427
+ [default3]:[rank3]: return do_bench(kernel_call, warmup=self.warmup, rep=self.rep, quantiles=(0.5, 0.2, 0.8))
428
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/testing.py", line 102, in do_bench
429
+ [default3]:[rank3]: fn()
430
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 110, in kernel_call
431
+ [default3]:[rank3]: self.fn.run(
432
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
433
+ [default3]:[rank3]: return self.fn.run(*args, **kwargs)
434
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
435
+ [default3]:[rank3]: return self.fn.run(*args, **kwargs)
436
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
437
+ [default3]:[rank3]: return self.fn.run(*args, **kwargs)
438
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 416, in run
439
+ [default3]:[rank3]: self.cache[device][key] = compile(
440
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/compiler/compiler.py", line 194, in compile
441
+ [default3]:[rank3]: metadata_group[f"{src.name}.{ext}"] = fn_cache_manager.put(next_module, f"{src.name}.{ext}")
442
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/cache.py", line 123, in put
443
+ [default3]:[rank3]: with open(temp_path, mode) as f:
444
+ [default3]:[rank3]: OSError: [Errno 122] Disk quota exceeded
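The traceback above bottoms out in Triton's kernel-cache writer (triton/runtime/cache.py, put), so [Errno 122] here is a filesystem quota being exhausted while Triton compiles and caches its JIT kernels, not a GPU memory problem. A minimal workaround sketch, assuming the compute nodes expose local scratch at /scratch and that this Triton build honours the TRITON_CACHE_DIR environment variable (both are assumptions, not taken from the log):

# Sketch: relocate the Triton JIT kernel cache to node-local scratch (assumed path)
export TRITON_CACHE_DIR="/scratch/$USER/triton_cache"
mkdir -p "$TRITON_CACHE_DIR"

Exported before launching torchrun, this would let each rank write its compiled kernels under scratch instead of the quota-limited default location (typically ~/.triton/cache under the user's home directory).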
445
+ [default0]:[rank8]: OSError: [Errno 122] Disk quota exceeded
446
+ [default0]:
447
+ [default0]:[rank8]: During handling of the above exception, another exception occurred:
448
+ [default0]:
449
+ [default0]:[rank8]: Traceback (most recent call last):
450
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
451
+ [default0]:[rank8]: trainer.train(dataloader)
452
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
453
+ [default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
454
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
455
+ [default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter(
456
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
457
+ [default0]:[rank8]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
458
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
459
+ [default0]:[rank8]: output = model(**micro_batch)
460
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
461
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
462
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
463
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
464
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
465
+ [default0]:[rank8]: sharded_logits = self.model(
466
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
467
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
468
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
469
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
470
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
471
+ [default0]:[rank8]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
472
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
473
+ [default0]:[rank8]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
474
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
475
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
476
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
477
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
478
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
479
+ [default0]:[rank8]: output = self.pp_block(**new_kwargs)
480
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
481
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
482
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
483
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
484
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
485
+ [default0]:[rank8]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
486
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
487
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
488
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
489
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
490
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 566, in forward
491
+ [default0]:[rank8]: query_states, key_value_states = self.flash_rotary_embedding(query_states, kv=key_value_states)
492
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
493
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
494
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
495
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
496
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/layers/rotary.py", line 457, in forward
497
+ [default0]:[rank8]: q = apply_rotary_emb_func(
498
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/layers/rotary.py", line 122, in apply_rotary_emb
499
+ [default0]:[rank8]: return ApplyRotaryEmb.apply(
500
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
501
+ [default0]:[rank8]: return super().apply(*args, **kwargs) # type: ignore[misc]
502
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/layers/rotary.py", line 48, in forward
503
+ [default0]:[rank8]: out = apply_rotary(
504
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/rotary.py", line 202, in apply_rotary
505
+ [default0]:[rank8]: rotary_kernel[grid](
506
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 167, in <lambda>
507
+ [default0]:[rank8]: return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
508
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 416, in run
509
+ [default0]:[rank8]: self.cache[device][key] = compile(
510
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/compiler/compiler.py", line 194, in compile
511
+ [default0]:[rank8]: metadata_group[f"{src.name}.{ext}"] = fn_cache_manager.put(next_module, f"{src.name}.{ext}")
512
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/cache.py", line 123, in put
513
+ [default0]:[rank8]: with open(temp_path, mode) as f:
514
+ [default0]:[rank8]: OSError: [Errno 122] Disk quota exceeded
515
+ [default7]:[rank15]: OSError: [Errno 122] Disk quota exceeded
516
+ [default7]:
517
+ [default7]:[rank15]: During handling of the above exception, another exception occurred:
518
+ [default7]:
519
+ [default7]:[rank15]: Traceback (most recent call last):
520
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
521
+ [default6]:[rank14]: OSError: [Errno 122] Disk quota exceeded
522
+ [default6]:
523
+ [default7]:[rank15]: trainer.train(dataloader)
524
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
525
+ [default7]:[rank15]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
526
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
527
+ [default7]:[rank15]: outputs = self.pipeline_engine.train_batch_iter(
528
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
529
+ [default7]:[rank15]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
530
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
531
+ [default7]:[rank15]: output = model(**micro_batch)
532
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
533
+ [default7]:[rank15]: return self._call_impl(*args, **kwargs)
534
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
535
+ [default7]:[rank15]: return forward_call(*args, **kwargs)
536
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
537
+ [default7]:[rank15]: sharded_logits = self.model(
538
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
539
+ [default7]:[rank15]: return self._call_impl(*args, **kwargs)
540
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
541
+ [default7]:[rank15]: return forward_call(*args, **kwargs)
542
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
543
+ [default7]:[rank15]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
544
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
545
+ [default7]:[rank15]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
546
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
547
+ [default6]:[rank14]: During handling of the above exception, another exception occurred:
548
+ [default6]:
549
+ [default6]:[rank14]: Traceback (most recent call last):
550
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
551
+ [default6]:[rank14]: trainer.train(dataloader)
552
+ [default7]:[rank15]: return self._call_impl(*args, **kwargs)
553
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
554
+ [default7]:[rank15]: return forward_call(*args, **kwargs)
555
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
556
+ [default7]:[rank15]: output = self.pp_block(**new_kwargs)
557
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
558
+ [default7]:[rank15]: return self._call_impl(*args, **kwargs)
559
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
560
+ [default7]:[rank15]: return forward_call(*args, **kwargs)
561
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
562
+ [default7]:[rank15]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
563
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
564
+ [default7]:[rank15]: return self._call_impl(*args, **kwargs)
565
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
566
+ [default7]:[rank15]: return forward_call(*args, **kwargs)
567
+ [default7]:[rank15]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 566, in forward
568
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
569
+ [default6]:[rank14]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
570
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
571
+ [default7]:[rank15]: query_states, key_value_states = self.flash_rotary_embedding(query_states, kv=key_value_states)
572
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
573
+ [default6]:[rank14]: outputs = self.pipeline_engine.train_batch_iter(
574
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
575
+ [default6]:[rank14]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
576
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
577
+ [default6]:[rank14]: output = model(**micro_batch)
578
+ [default7]:[rank15]: return self._call_impl(*args, **kwargs)
579
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
580
+ [default6]:[rank14]: return self._call_impl(*args, **kwargs)
581
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
582
+ [default6]:[rank14]: return forward_call(*args, **kwargs)
583
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
584
+ [default6]:[rank14]: sharded_logits = self.model(
585
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
586
+ [default6]:[rank14]: return self._call_impl(*args, **kwargs)
587
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
588
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
589
+ [default7]:[rank15]: return forward_call(*args, **kwargs)
590
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/layers/rotary.py", line 457, in forward
591
+ [default6]:[rank14]: return forward_call(*args, **kwargs)
592
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
593
+ [default6]:[rank14]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
594
+ [default7]:[rank15]: q = apply_rotary_emb_func(
595
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/layers/rotary.py", line 122, in apply_rotary_emb
596
+ [default7]:[rank15]: return ApplyRotaryEmb.apply(
597
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
598
+ [default7]:[rank15]: return super().apply(*args, **kwargs) # type: ignore[misc]
599
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
600
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/layers/rotary.py", line 48, in forward
601
+ [default7]:[rank15]: out = apply_rotary(
602
+ [default6]:[rank14]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
603
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
604
+ [default6]:[rank14]: return self._call_impl(*args, **kwargs)
605
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
606
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/rotary.py", line 202, in apply_rotary
607
+ [default7]:[rank15]: rotary_kernel[grid](
608
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 167, in <lambda>
609
+ [default6]:[rank14]: return forward_call(*args, **kwargs)
610
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
611
+ [default6]:[rank14]: output = self.pp_block(**new_kwargs)
612
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
613
+ [default6]:[rank14]: return self._call_impl(*args, **kwargs)
614
+ [default7]:[rank15]: return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
615
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 416, in run
616
+ [default7]:[rank15]: self.cache[device][key] = compile(
617
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/compiler/compiler.py", line 194, in compile
618
+ [default7]:[rank15]: metadata_group[f"{src.name}.{ext}"] = fn_cache_manager.put(next_module, f"{src.name}.{ext}")
619
+ [default7]:[rank15]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/cache.py", line 123, in put
620
+ [default7]:[rank15]: with open(temp_path, mode) as f:
621
+ [default7]:[rank15]: OSError: [Errno 122] Disk quota exceeded
622
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
623
+ [default6]:[rank14]: return forward_call(*args, **kwargs)
624
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
625
+ [default6]:[rank14]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
626
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
627
+ [default6]:[rank14]: return self._call_impl(*args, **kwargs)
628
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
629
+ [default6]:[rank14]: return forward_call(*args, **kwargs)
630
+ [default6]:[rank14]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 566, in forward
631
+ [default6]:[rank14]: query_states, key_value_states = self.flash_rotary_embedding(query_states, kv=key_value_states)
632
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
633
+ [default6]:[rank14]: return self._call_impl(*args, **kwargs)
634
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
635
+ [default6]:[rank14]: return forward_call(*args, **kwargs)
636
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/layers/rotary.py", line 457, in forward
637
+ [default6]:[rank14]: q = apply_rotary_emb_func(
638
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/layers/rotary.py", line 122, in apply_rotary_emb
639
+ [default6]:[rank14]: return ApplyRotaryEmb.apply(
640
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
641
+ [default6]:[rank14]: return super().apply(*args, **kwargs) # type: ignore[misc]
642
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/layers/rotary.py", line 48, in forward
643
+ [default6]:[rank14]: out = apply_rotary(
644
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/rotary.py", line 202, in apply_rotary
645
+ [default6]:[rank14]: rotary_kernel[grid](
646
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 167, in <lambda>
647
+ [default6]:[rank14]: return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
648
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 416, in run
649
+ [default6]:[rank14]: self.cache[device][key] = compile(
650
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/compiler/compiler.py", line 194, in compile
651
+ [default6]:[rank14]: metadata_group[f"{src.name}.{ext}"] = fn_cache_manager.put(next_module, f"{src.name}.{ext}")
652
+ [default6]:[rank14]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/cache.py", line 123, in put
653
+ [default6]:[rank14]: with open(temp_path, mode) as f:
654
+ [default6]:[rank14]: OSError: [Errno 122] Disk quota exceeded
655
+ [default3]:[rank11]: OSError: [Errno 122] Disk quota exceeded
656
+ [default3]:
657
+ [default3]:[rank11]: During handling of the above exception, another exception occurred:
658
+ [default3]:
659
+ [default3]:[rank11]: Traceback (most recent call last):
660
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
661
+ [default3]:[rank11]: trainer.train(dataloader)
662
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
663
+ [default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
664
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
665
+ [default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter(
666
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
667
+ [default3]:[rank11]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
668
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
669
+ [default3]:[rank11]: grad_accumulator.backward(sum(activations))
670
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
671
+ [default3]:[rank11]: result = loss.backward()
672
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
673
+ [default3]:[rank11]: torch.autograd.backward(
674
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
675
+ [default3]:[rank11]: _engine_run_backward(
676
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
677
+ [default3]:[rank11]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
678
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
679
+ [default3]:[rank11]: return user_fn(self, *args)
680
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 821, in backward
681
+ [default3]:[rank11]: dx, dw, db, dresidual_in, dx1, dw1, db1 = _layer_norm_bwd(
682
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/layer_norm.py", line 643, in _layer_norm_bwd
683
+ [default3]:[rank11]: _layer_norm_bwd_kernel[grid](
684
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 167, in <lambda>
685
+ [default3]:[rank11]: return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
686
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 143, in run
687
+ [default3]:[rank11]: timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
688
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 143, in <dictcomp>
689
+ [default3]:[rank11]: timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
690
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 122, in _bench
691
+ [default3]:[rank11]: return do_bench(kernel_call, warmup=self.warmup, rep=self.rep, quantiles=(0.5, 0.2, 0.8))
692
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/testing.py", line 102, in do_bench
693
+ [default3]:[rank11]: fn()
694
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 110, in kernel_call
695
+ [default3]:[rank11]: self.fn.run(
696
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
697
+ [default3]:[rank11]: return self.fn.run(*args, **kwargs)
698
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
699
+ [default3]:[rank11]: return self.fn.run(*args, **kwargs)
700
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 305, in run
701
+ [default3]:[rank11]: return self.fn.run(*args, **kwargs)
702
+ [default3]:[rank11]: [Previous line repeated 2 more times]
703
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 416, in run
704
+ [default3]:[rank11]: self.cache[device][key] = compile(
705
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/compiler/compiler.py", line 194, in compile
706
+ [default3]:[rank11]: metadata_group[f"{src.name}.{ext}"] = fn_cache_manager.put(next_module, f"{src.name}.{ext}")
707
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/cache.py", line 123, in put
708
+ [default3]:[rank11]: with open(temp_path, mode) as f:
709
+ [default3]:[rank11]: OSError: [Errno 122] Disk quota exceeded
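Rank 11 hits the same quota from the backward pass, through Triton's autotuner (autotuner.py -> _bench -> do_bench): each candidate kernel config is compiled, and its artifacts written to the cache, as it is benchmarked, so every rank multiplies the number of cached files and exhausts the quota faster. A fail-fast pre-flight check, as a sketch only (the default cache path is an assumption about this Triton install):

# Sketch: abort early if the Triton cache location cannot accept new files
CACHE_DIR="${TRITON_CACHE_DIR:-$HOME/.triton/cache}"   # default path assumed
mkdir -p "$CACHE_DIR" && touch "$CACHE_DIR/.write_test" \
  || { echo "Triton cache dir $CACHE_DIR is not writable (quota exceeded?)"; exit 1; }
rm -f "$CACHE_DIR/.write_test"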
710
+ [default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
711
+ [default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
712
+ [default4]:[rank12]: OSError: [Errno 122] Disk quota exceeded
713
+ [default4]:
714
+ [default4]:[rank12]: During handling of the above exception, another exception occurred:
715
+ [default4]:
716
+ [default4]:[rank12]: Traceback (most recent call last):
717
+ [default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
718
+ [default4]:[rank12]: trainer.train(dataloader)
719
+ [default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
720
+ [default4]:[rank12]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
721
+ [default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
722
+ [default4]:[rank12]: outputs = self.pipeline_engine.train_batch_iter(
723
+ [default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 295, in train_batch_iter
724
+ [default4]:[rank12]: self.backward(context=context, state=state, grad_accumulator=grad_accumulator)
725
+ [default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 86, in backward
726
+ [default4]:[rank12]: grad_accumulator.backward(sum(activations))
727
+ [default4]:[rank12]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/optim/gradient_accumulator.py", line 205, in backward
728
+ [default4]:[rank12]: result = loss.backward()
729
+ [default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
730
+ [default4]:[rank12]: torch.autograd.backward(
731
+ [default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
732
+ [default4]:[rank12]: _engine_run_backward(
733
+ [default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
734
+ [default4]:[rank12]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
735
+ [default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
736
+ [default4]:[rank12]: return user_fn(self, *args)
737
+ [default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/layers/rotary.py", line 261, in backward
738
+ [default4]:[rank12]: apply_rotary(
739
+ [default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/rotary.py", line 202, in apply_rotary
740
+ [default4]:[rank12]: rotary_kernel[grid](
741
+ [default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 167, in <lambda>
742
+ [default4]:[rank12]: return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
743
+ [default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 416, in run
744
+ [default4]:[rank12]: self.cache[device][key] = compile(
745
+ [default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/compiler/compiler.py", line 194, in compile
746
+ [default4]:[rank12]: metadata_group[f"{src.name}.{ext}"] = fn_cache_manager.put(next_module, f"{src.name}.{ext}")
747
+ [default4]:[rank12]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/cache.py", line 123, in put
748
+ [default4]:[rank12]: with open(temp_path, mode) as f:
749
+ [default4]:[rank12]: OSError: [Errno 122] Disk quota exceeded
750
+ [default6]:[rank6]: OSError: [Errno 122] Disk quota exceeded
751
+ [default6]:
752
+ [default6]:[rank6]: During handling of the above exception, another exception occurred:
753
+ [default6]:
754
+ [default6]:[rank6]: Traceback (most recent call last):
755
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
756
+ [default6]:[rank6]: trainer.train(dataloader)
757
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
758
+ [default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
759
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
760
+ [default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter(
761
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
762
+ [default6]:[rank6]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
763
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
764
+ [default6]:[rank6]: output = model(**micro_batch)
765
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
766
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
767
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
768
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
769
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
770
+ [default6]:[rank6]: sharded_logits = self.model(
771
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
772
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
773
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
774
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
775
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
776
+ [default6]:[rank6]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
777
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
778
+ [default6]:[rank6]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
779
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
780
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
781
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
782
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
783
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
784
+ [default6]:[rank6]: output = self.pp_block(**new_kwargs)
785
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
786
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
787
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
788
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
789
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 631, in forward
790
+ [default6]:[rank6]: output = self.attn(hidden_states=hidden_states, sequence_mask=sequence_mask)
791
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
792
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
793
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
794
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
795
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 566, in forward
796
+ [default6]:[rank6]: query_states, key_value_states = self.flash_rotary_embedding(query_states, kv=key_value_states)
797
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
798
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
799
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
800
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
801
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/layers/rotary.py", line 457, in forward
802
+ [default6]:[rank6]: q = apply_rotary_emb_func(
803
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/layers/rotary.py", line 122, in apply_rotary_emb
804
+ [default6]:[rank6]: return ApplyRotaryEmb.apply(
805
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
806
+ [default6]:[rank6]: return super().apply(*args, **kwargs) # type: ignore[misc]
807
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/layers/rotary.py", line 48, in forward
808
+ [default6]:[rank6]: out = apply_rotary(
809
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/flash_attn/ops/triton/rotary.py", line 202, in apply_rotary
810
+ [default6]:[rank6]: rotary_kernel[grid](
811
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 167, in <lambda>
812
+ [default6]:[rank6]: return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
813
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/jit.py", line 416, in run
814
+ [default6]:[rank6]: self.cache[device][key] = compile(
815
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/compiler/compiler.py", line 194, in compile
816
+ [default6]:[rank6]: metadata_group[f"{src.name}.{ext}"] = fn_cache_manager.put(next_module, f"{src.name}.{ext}")
817
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/triton/runtime/cache.py", line 123, in put
818
+ [default6]:[rank6]: with open(temp_path, mode) as f:
819
+ [default6]:[rank6]: OSError: [Errno 122] Disk quota exceeded
820
+ W0702 16:32:10.570000 139668155955008 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3409968 closing signal SIGTERM
821
+ W0702 16:32:10.574000 139668155955008 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3409969 closing signal SIGTERM
822
+ W0702 16:32:10.576000 140446723327808 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 403560 closing signal SIGTERM
823
+ W0702 16:32:10.580000 140446723327808 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 403561 closing signal SIGTERM
824
+ W0702 16:32:10.581000 139668155955008 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3409971 closing signal SIGTERM
825
+ W0702 16:32:10.586000 140446723327808 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 403564 closing signal SIGTERM
826
+ W0702 16:32:10.602000 139668155955008 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3409973 closing signal SIGTERM
827
+ W0702 16:32:10.603000 139668155955008 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 3409974 closing signal SIGTERM
828
+ E0702 16:32:12.295000 140446723327808 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 403559) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
829
+ Traceback (most recent call last):
830
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
831
+ sys.exit(main())
832
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
833
+ return f(*args, **kwargs)
834
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
835
+ run(args)
836
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
837
+ elastic_launch(
838
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
839
+ return launch_agent(self._config, self._entrypoint, list(args))
840
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
841
+ raise ChildFailedError(
842
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
843
+ ============================================================
844
+ /fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
845
+ ------------------------------------------------------------
846
+ Failures:
847
+ [1]:
848
+ time : 2024-07-02_16:32:10
849
+ host : ip-26-0-171-88.ec2.internal
850
+ rank : 11 (local_rank: 3)
851
+ exitcode : 1 (pid: 403562)
852
+ error_file: <N/A>
853
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
854
+ [2]:
855
+ time : 2024-07-02_16:32:10
856
+ host : ip-26-0-171-88.ec2.internal
857
+ rank : 12 (local_rank: 4)
858
+ exitcode : 1 (pid: 403563)
859
+ error_file: <N/A>
860
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
861
+ [3]:
862
+ time : 2024-07-02_16:32:10
863
+ host : ip-26-0-171-88.ec2.internal
864
+ rank : 14 (local_rank: 6)
865
+ exitcode : 1 (pid: 403565)
866
+ error_file: <N/A>
867
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
868
+ [4]:
869
+ time : 2024-07-02_16:32:10
870
+ host : ip-26-0-171-88.ec2.internal
871
+ rank : 15 (local_rank: 7)
872
+ exitcode : 1 (pid: 403566)
873
+ error_file: <N/A>
874
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
875
+ ------------------------------------------------------------
876
+ Root Cause (first observed failure):
877
+ [0]:
878
+ time : 2024-07-02_16:32:10
879
+ host : ip-26-0-171-88.ec2.internal
880
+ rank : 8 (local_rank: 0)
881
+ exitcode : 1 (pid: 403559)
882
+ error_file: <N/A>
883
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
884
+ ============================================================
885
+ E0702 16:32:12.506000 139668155955008 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 3409967) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
886
+ Traceback (most recent call last):
887
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
888
+ sys.exit(main())
889
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
890
+ return f(*args, **kwargs)
891
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
892
+ run(args)
893
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
894
+ elastic_launch(
895
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
896
+ return launch_agent(self._config, self._entrypoint, list(args))
897
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
898
+ raise ChildFailedError(
899
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
900
+ ============================================================
901
+ /fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
902
+ ------------------------------------------------------------
903
+ Failures:
904
+ [1]:
905
+ time : 2024-07-02_16:32:10
906
+ host : ip-26-0-171-62.ec2.internal
907
+ rank : 3 (local_rank: 3)
908
+ exitcode : 1 (pid: 3409970)
909
+ error_file: <N/A>
910
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
911
+ [2]:
912
+ time : 2024-07-02_16:32:10
913
+ host : ip-26-0-171-62.ec2.internal
914
+ rank : 5 (local_rank: 5)
915
+ exitcode : 1 (pid: 3409972)
916
+ error_file: <N/A>
917
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
918
+ ------------------------------------------------------------
919
+ Root Cause (first observed failure):
920
+ [0]:
921
+ time : 2024-07-02_16:32:10
922
+ host : ip-26-0-171-62.ec2.internal
923
+ rank : 0 (local_rank: 0)
924
+ exitcode : 1 (pid: 3409967)
925
+ error_file: <N/A>
926
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
927
+ ============================================================
928
+ srun: error: ip-26-0-171-88: task 1: Exited with exit code 1
929
+ srun: error: ip-26-0-171-62: task 0: Exited with exit code 1
930
+ Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
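The final hint about hf_transfer comes from huggingface_hub's uploader and is unrelated to the failure above; if faster uploads of these result folders are wanted, a sketch (assuming the package can be installed in this environment):

# Sketch: enable the optional hf_transfer upload backend for huggingface_hub
pip install hf_transfer
export HF_HUB_ENABLE_HF_TRANSFER=1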
llama-1B/16_GPUS/dp-8_tp-2_pp-1_mbz-4/status.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ fail